[llvm] [RISCV] Lower i8/i16/i32 scalable vector ISD::CLMUL/CLMULH with Zvbc32e. (PR #184465)

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 3 21:28:04 PST 2026


https://github.com/topperc updated https://github.com/llvm/llvm-project/pull/184465

>From 55d6448f6245320ff2a4d0dff879c0381f0d81cf Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Tue, 3 Mar 2026 15:10:05 -0800
Subject: [PATCH 1/3] Pre-commit tests

---
 llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll  | 16256 ++++++-
 llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll | 40759 ++++++++++++++++-
 2 files changed, 55579 insertions(+), 1436 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll
index 945419f895c2b..0ab98e756c6ed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+v < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V
 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs -mattr=+v < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64V
-; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+v,+zvbc < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVBC
-; RUN: llc -mtriple=riscv64 -verify-machineinstrs -mattr=+v,+zvbc < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVBC
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+v,+zvbc < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVBC,RV32ZVBC64
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs -mattr=+v,+zvbc < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVBC,RV64ZVBC64
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+v,+experimental-zvbc32e < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVBC,RV32ZVBC32
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs -mattr=+v,+experimental-zvbc32e < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVBC,RV64ZVBC32
 
 define <vscale x 1 x i8> @clmul_nxv1i8_vv(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb) nounwind {
 ; RV32V-LABEL: clmul_nxv1i8_vv:
@@ -69,33 +71,97 @@ define <vscale x 1 x i8> @clmul_nxv1i8_vv(<vscale x 1 x i8> %va, <vscale x 1 x i
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv1i8_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v10, v9
-; RV32ZVBC-NEXT:    vzext.vf8 v9, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v9, v10
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv1i8_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf8 v9, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv1i8_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v10, v9
-; RV64ZVBC-NEXT:    vzext.vf8 v9, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v9, v10
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmul_nxv1i8_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf8 v9, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv1i8_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv1i8_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 1 x i8> @llvm.clmul.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb)
   ret <vscale x 1 x i8> %v
 }
@@ -167,37 +233,103 @@ define <vscale x 1 x i8> @clmul_nxv1i8_vx(<vscale x 1 x i8> %va, i8 %b) nounwind
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv1i8_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v10, v8
-; RV32ZVBC-NEXT:    vzext.vf8 v8, v9
-; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv1i8_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf8 v8, v9
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv1i8_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf8 v8, v9
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv1i8_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v10, v8
-; RV64ZVBC-NEXT:    vzext.vf8 v8, v9
-; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv1i8_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv1i8_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
   %v = call <vscale x 1 x i8> @llvm.clmul.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb)
@@ -269,33 +401,97 @@ define <vscale x 2 x i8> @clmul_nxv2i8_vv(<vscale x 2 x i8> %va, <vscale x 2 x i
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv2i8_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v10, v9
-; RV32ZVBC-NEXT:    vzext.vf8 v12, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v10
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv2i8_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf8 v12, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v12, v10
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv2i8_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf8 v12, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v12, v10
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv2i8_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v10, v9
-; RV64ZVBC-NEXT:    vzext.vf8 v12, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v10
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv2i8_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv2i8_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 2 x i8> @llvm.clmul.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb)
   ret <vscale x 2 x i8> %v
 }
@@ -367,37 +563,103 @@ define <vscale x 2 x i8> @clmul_nxv2i8_vx(<vscale x 2 x i8> %va, i8 %b) nounwind
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv2i8_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v12, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v10, v8
-; RV32ZVBC-NEXT:    vzext.vf8 v8, v12
-; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv2i8_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf8 v8, v12
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv2i8_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf8 v8, v12
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv2i8_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v12, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v10, v8
-; RV64ZVBC-NEXT:    vzext.vf8 v8, v12
-; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv2i8_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv2i8_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
   %v = call <vscale x 2 x i8> @llvm.clmul.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb)
@@ -469,33 +731,97 @@ define <vscale x 4 x i8> @clmul_nxv4i8_vv(<vscale x 4 x i8> %va, <vscale x 4 x i
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv4i8_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v12, v9
-; RV32ZVBC-NEXT:    vzext.vf8 v16, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v12
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv4i8_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v12, v9
+; RV32ZVBC64-NEXT:    vzext.vf8 v16, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v16, v12
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv4i8_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v12, v9
+; RV64ZVBC64-NEXT:    vzext.vf8 v16, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v16, v12
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv4i8_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v12, v9
-; RV64ZVBC-NEXT:    vzext.vf8 v16, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v12
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv4i8_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv4i8_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.clmul.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb)
   ret <vscale x 4 x i8> %v
 }
@@ -567,37 +893,103 @@ define <vscale x 4 x i8> @clmul_nxv4i8_vx(<vscale x 4 x i8> %va, i8 %b) nounwind
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv4i8_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v16, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v12, v8
-; RV32ZVBC-NEXT:    vzext.vf8 v8, v16
-; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv4i8_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v12, v8
+; RV32ZVBC64-NEXT:    vzext.vf8 v8, v16
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v12, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv4i8_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v12, v8
+; RV64ZVBC64-NEXT:    vzext.vf8 v8, v16
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v12, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv4i8_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v16, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v12, v8
-; RV64ZVBC-NEXT:    vzext.vf8 v8, v16
-; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv4i8_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv4i8_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
   %v = call <vscale x 4 x i8> @llvm.clmul.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb)
@@ -669,33 +1061,97 @@ define <vscale x 8 x i8> @clmul_nxv8i8_vv(<vscale x 8 x i8> %va, <vscale x 8 x i
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv8i8_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v16, v9
-; RV32ZVBC-NEXT:    vzext.vf8 v24, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v24, v16
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v10, v16, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv8i8_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v16, v9
+; RV32ZVBC64-NEXT:    vzext.vf8 v24, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v24, v16
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v10, v16, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv8i8_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v16, v9
+; RV64ZVBC64-NEXT:    vzext.vf8 v24, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v24, v16
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v10, v16, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv8i8_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v16, v9
-; RV64ZVBC-NEXT:    vzext.vf8 v24, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v24, v16
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v10, v16, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv8i8_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv8i8_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 8 x i8> @llvm.clmul.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb)
   ret <vscale x 8 x i8> %v
 }
@@ -767,37 +1223,103 @@ define <vscale x 8 x i8> @clmul_nxv8i8_vx(<vscale x 8 x i8> %va, i8 %b) nounwind
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv8i8_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v24, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v16, v8
-; RV32ZVBC-NEXT:    vzext.vf8 v8, v24
-; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v10, v16, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv8i8_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v16, v8
+; RV32ZVBC64-NEXT:    vzext.vf8 v8, v24
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v10, v16, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv8i8_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v16, v8
+; RV64ZVBC64-NEXT:    vzext.vf8 v8, v24
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v10, v16, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv8i8_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v24, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v16, v8
-; RV64ZVBC-NEXT:    vzext.vf8 v8, v24
-; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v10, v16, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv8i8_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv8i8_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
   %v = call <vscale x 8 x i8> @llvm.clmul.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb)
@@ -1160,29 +1682,159 @@ define <vscale x 1 x i16> @clmul_nxv1i16_vv(<vscale x 1 x i16> %va, <vscale x 1
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv1i16_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v10, v9
-; RV32ZVBC-NEXT:    vzext.vf4 v9, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v9, v10
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv1i16_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf4 v9, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv1i16_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf4 v9, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv1i16_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v10, v9
-; RV64ZVBC-NEXT:    vzext.vf4 v9, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v9, v10
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv1i16_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv1i16_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 1 x i16> @llvm.clmul.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb)
   ret <vscale x 1 x i16> %v
 }
@@ -1320,33 +1972,165 @@ define <vscale x 1 x i16> @clmul_nxv1i16_vx(<vscale x 1 x i16> %va, i16 %b) noun
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv1i16_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v10, v8
-; RV32ZVBC-NEXT:    vzext.vf4 v8, v9
-; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv1i16_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf4 v8, v9
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv1i16_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf4 v8, v9
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv1i16_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv1i16_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v10, v8
-; RV64ZVBC-NEXT:    vzext.vf4 v8, v9
-; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmul_nxv1i16_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
   %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
   %v = call <vscale x 1 x i16> @llvm.clmul.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb)
@@ -1484,29 +2268,159 @@ define <vscale x 2 x i16> @clmul_nxv2i16_vv(<vscale x 2 x i16> %va, <vscale x 2
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv2i16_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v10, v9
-; RV32ZVBC-NEXT:    vzext.vf4 v12, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v10
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv2i16_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf4 v12, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v12, v10
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv2i16_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf4 v12, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v12, v10
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv2i16_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv2i16_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v10, v9
-; RV64ZVBC-NEXT:    vzext.vf4 v12, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v10
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmul_nxv2i16_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.clmul.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb)
   ret <vscale x 2 x i16> %v
 }
@@ -1644,33 +2558,165 @@ define <vscale x 2 x i16> @clmul_nxv2i16_vx(<vscale x 2 x i16> %va, i16 %b) noun
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv2i16_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v12, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v10, v8
-; RV32ZVBC-NEXT:    vzext.vf4 v8, v12
-; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv2i16_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf4 v8, v12
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv2i16_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v12, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v10, v8
-; RV64ZVBC-NEXT:    vzext.vf4 v8, v12
-; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmul_nxv2i16_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf4 v8, v12
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv2i16_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv2i16_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
   %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
   %v = call <vscale x 2 x i16> @llvm.clmul.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb)
@@ -1808,29 +2854,159 @@ define <vscale x 4 x i16> @clmul_nxv4i16_vv(<vscale x 4 x i16> %va, <vscale x 4
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv4i16_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v12, v9
-; RV32ZVBC-NEXT:    vzext.vf4 v16, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v12
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv4i16_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v12, v9
+; RV32ZVBC64-NEXT:    vzext.vf4 v16, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v16, v12
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv4i16_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v12, v9
+; RV64ZVBC64-NEXT:    vzext.vf4 v16, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v16, v12
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv4i16_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv4i16_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v12, v9
-; RV64ZVBC-NEXT:    vzext.vf4 v16, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v12
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmul_nxv4i16_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb)
   ret <vscale x 4 x i16> %v
 }
@@ -1968,33 +3144,165 @@ define <vscale x 4 x i16> @clmul_nxv4i16_vx(<vscale x 4 x i16> %va, i16 %b) noun
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv4i16_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v16, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v12, v8
-; RV32ZVBC-NEXT:    vzext.vf4 v8, v16
-; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv4i16_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v12, v8
+; RV32ZVBC64-NEXT:    vzext.vf4 v8, v16
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v12, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv4i16_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v16, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v12, v8
-; RV64ZVBC-NEXT:    vzext.vf4 v8, v16
-; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmul_nxv4i16_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v12, v8
+; RV64ZVBC64-NEXT:    vzext.vf4 v8, v16
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v12, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv4i16_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv4i16_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
   %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
   %v = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb)
@@ -2132,29 +3440,159 @@ define <vscale x 8 x i16> @clmul_nxv8i16_vv(<vscale x 8 x i16> %va, <vscale x 8
 ; RV64V-NEXT:    vxor.vv v8, v12, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv8i16_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v16, v10
-; RV32ZVBC-NEXT:    vzext.vf4 v24, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v24, v16
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v16, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv8i16_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v16, v10
+; RV32ZVBC64-NEXT:    vzext.vf4 v24, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v24, v16
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv8i16_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v16, v10
+; RV64ZVBC64-NEXT:    vzext.vf4 v24, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v24, v16
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv8i16_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v12, v10, 2
+; RV32ZVBC32-NEXT:    vand.vi v14, v10, 1
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV32ZVBC32-NEXT:    vand.vi v14, v10, 4
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vi v14, v10, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v10, v10, a0
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv8i16_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v16, v10
-; RV64ZVBC-NEXT:    vzext.vf4 v24, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v24, v16
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v16, 0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmul_nxv8i16_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v12, v10, 2
+; RV64ZVBC32-NEXT:    vand.vi v14, v10, 1
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV64ZVBC32-NEXT:    vand.vi v14, v10, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vi v14, v10, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a0
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v10
+; RV64ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb)
   ret <vscale x 8 x i16> %v
 }
@@ -2292,33 +3730,165 @@ define <vscale x 8 x i16> @clmul_nxv8i16_vx(<vscale x 8 x i16> %va, i16 %b) noun
 ; RV64V-NEXT:    vxor.vv v8, v12, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv8i16_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v24, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v16, v8
-; RV32ZVBC-NEXT:    vzext.vf4 v8, v24
-; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v16, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv8i16_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v16, v8
+; RV32ZVBC64-NEXT:    vzext.vf4 v8, v24
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv8i16_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v24, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v16, v8
-; RV64ZVBC-NEXT:    vzext.vf4 v8, v24
-; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v16, 0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmul_nxv8i16_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v16, v8
+; RV64ZVBC64-NEXT:    vzext.vf4 v8, v24
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv8i16_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v10, a0
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v12, v10, 2
+; RV32ZVBC32-NEXT:    vand.vi v14, v10, 1
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV32ZVBC32-NEXT:    vand.vi v14, v10, 4
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vi v14, v10, 8
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v10, v10, a0
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv8i16_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v10, a0
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v12, v10, 2
+; RV64ZVBC32-NEXT:    vand.vi v14, v10, 1
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV64ZVBC32-NEXT:    vand.vi v14, v10, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vi v14, v10, 8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a0
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v10
+; RV64ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
   %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
   %v = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb)
@@ -2866,25 +4436,283 @@ define <vscale x 1 x i32> @clmul_nxv1i32_vv(<vscale x 1 x i32> %va, <vscale x 1
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv1i32_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v10, v9
-; RV32ZVBC-NEXT:    vzext.vf2 v9, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v9, v10
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv1i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf2 v9, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv1i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf2 v9, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv1i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 2048
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 4096
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 8192
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 16384
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 32768
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 65536
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 131072
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv1i32_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v10, v9
-; RV64ZVBC-NEXT:    vzext.vf2 v9, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v9, v10
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmul_nxv1i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 524288
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb)
   ret <vscale x 1 x i32> %v
 }
@@ -3160,29 +4988,299 @@ define <vscale x 1 x i32> @clmul_nxv1i32_vx(<vscale x 1 x i32> %va, i32 %b) noun
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv1i32_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC-NEXT:    vzext.vf2 v8, v9
-; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv1i32_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v9
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv1i32_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC-NEXT:    vzext.vf2 v8, v9
-; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmul_nxv1i32_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v9
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv1i32_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -32
+; RV32ZVBC32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    andi s6, a0, 2
+; RV32ZVBC32-NEXT:    andi s5, a0, 1
+; RV32ZVBC32-NEXT:    andi s3, a0, 4
+; RV32ZVBC32-NEXT:    andi s1, a0, 8
+; RV32ZVBC32-NEXT:    andi t6, a0, 16
+; RV32ZVBC32-NEXT:    andi t4, a0, 32
+; RV32ZVBC32-NEXT:    andi t2, a0, 64
+; RV32ZVBC32-NEXT:    andi t0, a0, 128
+; RV32ZVBC32-NEXT:    andi a6, a0, 256
+; RV32ZVBC32-NEXT:    andi a4, a0, 512
+; RV32ZVBC32-NEXT:    andi a2, a0, 1024
+; RV32ZVBC32-NEXT:    li a1, 1
+; RV32ZVBC32-NEXT:    lui a3, 1
+; RV32ZVBC32-NEXT:    lui a5, 2
+; RV32ZVBC32-NEXT:    lui a7, 4
+; RV32ZVBC32-NEXT:    lui t1, 8
+; RV32ZVBC32-NEXT:    lui t3, 16
+; RV32ZVBC32-NEXT:    lui t5, 32
+; RV32ZVBC32-NEXT:    lui s0, 64
+; RV32ZVBC32-NEXT:    lui s2, 128
+; RV32ZVBC32-NEXT:    lui s4, 256
+; RV32ZVBC32-NEXT:    vsetvli s7, zero, e32, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vmul.vx v9, v8, s6
+; RV32ZVBC32-NEXT:    lui s6, 512
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s5
+; RV32ZVBC32-NEXT:    lui s5, 1024
+; RV32ZVBC32-NEXT:    vxor.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s3
+; RV32ZVBC32-NEXT:    lui s3, 2048
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s1
+; RV32ZVBC32-NEXT:    lui s1, 4096
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t6
+; RV32ZVBC32-NEXT:    lui t6, 8192
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t4
+; RV32ZVBC32-NEXT:    lui t4, 16384
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t2
+; RV32ZVBC32-NEXT:    lui t2, 32768
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t0
+; RV32ZVBC32-NEXT:    lui t0, 65536
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a6
+; RV32ZVBC32-NEXT:    lui a6, 131072
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a4
+; RV32ZVBC32-NEXT:    lui a4, 262144
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 524288
+; RV32ZVBC32-NEXT:    slli a1, a1, 11
+; RV32ZVBC32-NEXT:    and a3, a0, a3
+; RV32ZVBC32-NEXT:    and a5, a0, a5
+; RV32ZVBC32-NEXT:    and a7, a0, a7
+; RV32ZVBC32-NEXT:    and t1, a0, t1
+; RV32ZVBC32-NEXT:    and t3, a0, t3
+; RV32ZVBC32-NEXT:    and t5, a0, t5
+; RV32ZVBC32-NEXT:    and s0, a0, s0
+; RV32ZVBC32-NEXT:    and s2, a0, s2
+; RV32ZVBC32-NEXT:    and s4, a0, s4
+; RV32ZVBC32-NEXT:    and s6, a0, s6
+; RV32ZVBC32-NEXT:    and s5, a0, s5
+; RV32ZVBC32-NEXT:    and s3, a0, s3
+; RV32ZVBC32-NEXT:    and s1, a0, s1
+; RV32ZVBC32-NEXT:    and t6, a0, t6
+; RV32ZVBC32-NEXT:    and t4, a0, t4
+; RV32ZVBC32-NEXT:    and t2, a0, t2
+; RV32ZVBC32-NEXT:    and t0, a0, t0
+; RV32ZVBC32-NEXT:    and a6, a0, a6
+; RV32ZVBC32-NEXT:    and a4, a0, a4
+; RV32ZVBC32-NEXT:    and a2, a0, a2
+; RV32ZVBC32-NEXT:    and a0, a0, a1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a0
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a3
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a5
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a7
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t3
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t5
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s0
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s4
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s6
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s5
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s3
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t6
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t4
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t0
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a6
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a4
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v8, v8, a2
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 32
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv1i32_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 524288
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
   %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
   %v = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb)
@@ -3448,25 +5546,283 @@ define <vscale x 2 x i32> @clmul_nxv2i32_vv(<vscale x 2 x i32> %va, <vscale x 2
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv2i32_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v10, v9
-; RV32ZVBC-NEXT:    vzext.vf2 v12, v8
-; RV32ZVBC-NEXT:    vclmul.vv v10, v12, v10
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv2i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v10, v12, v10
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv2i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v10, v12, v10
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv2i32_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v10, v9
-; RV64ZVBC-NEXT:    vzext.vf2 v12, v8
-; RV64ZVBC-NEXT:    vclmul.vv v10, v12, v10
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv2i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 2048
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 4096
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 8192
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 16384
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 32768
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 65536
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 131072
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv2i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 524288
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb)
   ret <vscale x 2 x i32> %v
 }
@@ -3742,29 +6098,299 @@ define <vscale x 2 x i32> @clmul_nxv2i32_vx(<vscale x 2 x i32> %va, i32 %b) noun
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv2i32_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v12, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC-NEXT:    vzext.vf2 v8, v12
-; RV32ZVBC-NEXT:    vclmul.vv v10, v10, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv2i32_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v12
+; RV32ZVBC64-NEXT:    vclmul.vv v10, v10, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv2i32_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v12
+; RV64ZVBC64-NEXT:    vclmul.vv v10, v10, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv2i32_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v12, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC-NEXT:    vzext.vf2 v8, v12
-; RV64ZVBC-NEXT:    vclmul.vv v10, v10, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv2i32_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -32
+; RV32ZVBC32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    andi s6, a0, 2
+; RV32ZVBC32-NEXT:    andi s5, a0, 1
+; RV32ZVBC32-NEXT:    andi s3, a0, 4
+; RV32ZVBC32-NEXT:    andi s1, a0, 8
+; RV32ZVBC32-NEXT:    andi t6, a0, 16
+; RV32ZVBC32-NEXT:    andi t4, a0, 32
+; RV32ZVBC32-NEXT:    andi t2, a0, 64
+; RV32ZVBC32-NEXT:    andi t0, a0, 128
+; RV32ZVBC32-NEXT:    andi a6, a0, 256
+; RV32ZVBC32-NEXT:    andi a4, a0, 512
+; RV32ZVBC32-NEXT:    andi a2, a0, 1024
+; RV32ZVBC32-NEXT:    li a1, 1
+; RV32ZVBC32-NEXT:    lui a3, 1
+; RV32ZVBC32-NEXT:    lui a5, 2
+; RV32ZVBC32-NEXT:    lui a7, 4
+; RV32ZVBC32-NEXT:    lui t1, 8
+; RV32ZVBC32-NEXT:    lui t3, 16
+; RV32ZVBC32-NEXT:    lui t5, 32
+; RV32ZVBC32-NEXT:    lui s0, 64
+; RV32ZVBC32-NEXT:    lui s2, 128
+; RV32ZVBC32-NEXT:    lui s4, 256
+; RV32ZVBC32-NEXT:    vsetvli s7, zero, e32, m1, ta, ma
+; RV32ZVBC32-NEXT:    vmul.vx v9, v8, s6
+; RV32ZVBC32-NEXT:    lui s6, 512
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s5
+; RV32ZVBC32-NEXT:    lui s5, 1024
+; RV32ZVBC32-NEXT:    vxor.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s3
+; RV32ZVBC32-NEXT:    lui s3, 2048
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s1
+; RV32ZVBC32-NEXT:    lui s1, 4096
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t6
+; RV32ZVBC32-NEXT:    lui t6, 8192
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t4
+; RV32ZVBC32-NEXT:    lui t4, 16384
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t2
+; RV32ZVBC32-NEXT:    lui t2, 32768
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t0
+; RV32ZVBC32-NEXT:    lui t0, 65536
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a6
+; RV32ZVBC32-NEXT:    lui a6, 131072
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a4
+; RV32ZVBC32-NEXT:    lui a4, 262144
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 524288
+; RV32ZVBC32-NEXT:    slli a1, a1, 11
+; RV32ZVBC32-NEXT:    and a3, a0, a3
+; RV32ZVBC32-NEXT:    and a5, a0, a5
+; RV32ZVBC32-NEXT:    and a7, a0, a7
+; RV32ZVBC32-NEXT:    and t1, a0, t1
+; RV32ZVBC32-NEXT:    and t3, a0, t3
+; RV32ZVBC32-NEXT:    and t5, a0, t5
+; RV32ZVBC32-NEXT:    and s0, a0, s0
+; RV32ZVBC32-NEXT:    and s2, a0, s2
+; RV32ZVBC32-NEXT:    and s4, a0, s4
+; RV32ZVBC32-NEXT:    and s6, a0, s6
+; RV32ZVBC32-NEXT:    and s5, a0, s5
+; RV32ZVBC32-NEXT:    and s3, a0, s3
+; RV32ZVBC32-NEXT:    and s1, a0, s1
+; RV32ZVBC32-NEXT:    and t6, a0, t6
+; RV32ZVBC32-NEXT:    and t4, a0, t4
+; RV32ZVBC32-NEXT:    and t2, a0, t2
+; RV32ZVBC32-NEXT:    and t0, a0, t0
+; RV32ZVBC32-NEXT:    and a6, a0, a6
+; RV32ZVBC32-NEXT:    and a4, a0, a4
+; RV32ZVBC32-NEXT:    and a2, a0, a2
+; RV32ZVBC32-NEXT:    and a0, a0, a1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a0
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a3
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a5
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a7
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t3
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t5
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s0
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s4
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s6
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s5
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s3
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t6
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t4
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t0
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a6
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a4
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vx v8, v8, a2
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 32
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv2i32_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    lui a0, 524288
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
   %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
   %v = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb)
@@ -4030,25 +6656,283 @@ define <vscale x 4 x i32> @clmul_nxv4i32_vv(<vscale x 4 x i32> %va, <vscale x 4
 ; RV64V-NEXT:    vxor.vv v8, v12, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv4i32_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v12, v10
-; RV32ZVBC-NEXT:    vzext.vf2 v16, v8
-; RV32ZVBC-NEXT:    vclmul.vv v12, v16, v12
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv4i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v12, v10
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v12, v16, v12
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv4i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v12, v10
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v12, v16, v12
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv4i32_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v12, v10
-; RV64ZVBC-NEXT:    vzext.vf2 v16, v8
-; RV64ZVBC-NEXT:    vclmul.vv v12, v16, v12
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv4i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v12, v10, 2
+; RV32ZVBC32-NEXT:    vand.vi v14, v10, 1
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV32ZVBC32-NEXT:    vand.vi v14, v10, 4
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vi v14, v10, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 2048
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 4096
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 8192
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 16384
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 32768
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 65536
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 131072
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    vand.vx v10, v10, a0
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv4i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v12, v10, 2
+; RV64ZVBC32-NEXT:    vand.vi v14, v10, 1
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV64ZVBC32-NEXT:    vand.vi v14, v10, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vi v14, v10, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 524288
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a0
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v10
+; RV64ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb)
   ret <vscale x 4 x i32> %v
 }
@@ -4324,29 +7208,299 @@ define <vscale x 4 x i32> @clmul_nxv4i32_vx(<vscale x 4 x i32> %va, i32 %b) noun
 ; RV64V-NEXT:    vxor.vv v8, v12, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv4i32_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v16, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v12, v8
-; RV32ZVBC-NEXT:    vzext.vf2 v8, v16
-; RV32ZVBC-NEXT:    vclmul.vv v12, v12, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv4i32_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v16
+; RV32ZVBC64-NEXT:    vclmul.vv v12, v12, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv4i32_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v16
+; RV64ZVBC64-NEXT:    vclmul.vv v12, v12, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv4i32_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v16, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v12, v8
-; RV64ZVBC-NEXT:    vzext.vf2 v8, v16
-; RV64ZVBC-NEXT:    vclmul.vv v12, v12, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv4i32_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -32
+; RV32ZVBC32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    andi s6, a0, 2
+; RV32ZVBC32-NEXT:    andi s5, a0, 1
+; RV32ZVBC32-NEXT:    andi s3, a0, 4
+; RV32ZVBC32-NEXT:    andi s1, a0, 8
+; RV32ZVBC32-NEXT:    andi t6, a0, 16
+; RV32ZVBC32-NEXT:    andi t4, a0, 32
+; RV32ZVBC32-NEXT:    andi t2, a0, 64
+; RV32ZVBC32-NEXT:    andi t0, a0, 128
+; RV32ZVBC32-NEXT:    andi a6, a0, 256
+; RV32ZVBC32-NEXT:    andi a4, a0, 512
+; RV32ZVBC32-NEXT:    andi a2, a0, 1024
+; RV32ZVBC32-NEXT:    li a1, 1
+; RV32ZVBC32-NEXT:    lui a3, 1
+; RV32ZVBC32-NEXT:    lui a5, 2
+; RV32ZVBC32-NEXT:    lui a7, 4
+; RV32ZVBC32-NEXT:    lui t1, 8
+; RV32ZVBC32-NEXT:    lui t3, 16
+; RV32ZVBC32-NEXT:    lui t5, 32
+; RV32ZVBC32-NEXT:    lui s0, 64
+; RV32ZVBC32-NEXT:    lui s2, 128
+; RV32ZVBC32-NEXT:    lui s4, 256
+; RV32ZVBC32-NEXT:    vsetvli s7, zero, e32, m2, ta, ma
+; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s6
+; RV32ZVBC32-NEXT:    lui s6, 512
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s5
+; RV32ZVBC32-NEXT:    lui s5, 1024
+; RV32ZVBC32-NEXT:    vxor.vv v10, v12, v10
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s3
+; RV32ZVBC32-NEXT:    lui s3, 2048
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s1
+; RV32ZVBC32-NEXT:    lui s1, 4096
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t6
+; RV32ZVBC32-NEXT:    lui t6, 8192
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t4
+; RV32ZVBC32-NEXT:    lui t4, 16384
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t2
+; RV32ZVBC32-NEXT:    lui t2, 32768
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t0
+; RV32ZVBC32-NEXT:    lui t0, 65536
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a6
+; RV32ZVBC32-NEXT:    lui a6, 131072
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a4
+; RV32ZVBC32-NEXT:    lui a4, 262144
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 524288
+; RV32ZVBC32-NEXT:    slli a1, a1, 11
+; RV32ZVBC32-NEXT:    and a3, a0, a3
+; RV32ZVBC32-NEXT:    and a5, a0, a5
+; RV32ZVBC32-NEXT:    and a7, a0, a7
+; RV32ZVBC32-NEXT:    and t1, a0, t1
+; RV32ZVBC32-NEXT:    and t3, a0, t3
+; RV32ZVBC32-NEXT:    and t5, a0, t5
+; RV32ZVBC32-NEXT:    and s0, a0, s0
+; RV32ZVBC32-NEXT:    and s2, a0, s2
+; RV32ZVBC32-NEXT:    and s4, a0, s4
+; RV32ZVBC32-NEXT:    and s6, a0, s6
+; RV32ZVBC32-NEXT:    and s5, a0, s5
+; RV32ZVBC32-NEXT:    and s3, a0, s3
+; RV32ZVBC32-NEXT:    and s1, a0, s1
+; RV32ZVBC32-NEXT:    and t6, a0, t6
+; RV32ZVBC32-NEXT:    and t4, a0, t4
+; RV32ZVBC32-NEXT:    and t2, a0, t2
+; RV32ZVBC32-NEXT:    and t0, a0, t0
+; RV32ZVBC32-NEXT:    and a6, a0, a6
+; RV32ZVBC32-NEXT:    and a4, a0, a4
+; RV32ZVBC32-NEXT:    and a2, a0, a2
+; RV32ZVBC32-NEXT:    and a0, a0, a1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a0
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a3
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a5
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a7
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t3
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t5
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s0
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s2
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s4
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s6
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s5
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s3
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t6
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t4
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t2
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t0
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a6
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a4
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vx v8, v8, a2
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 32
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv4i32_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v10, a0
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v12, v10, 2
+; RV64ZVBC32-NEXT:    vand.vi v14, v10, 1
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV64ZVBC32-NEXT:    vand.vi v14, v10, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vi v14, v10, 8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    lui a0, 524288
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a0
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v10
+; RV64ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %v = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb)
@@ -4612,25 +7766,283 @@ define <vscale x 8 x i32> @clmul_nxv8i32_vv(<vscale x 8 x i32> %va, <vscale x 8
 ; RV64V-NEXT:    vxor.vv v8, v16, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv8i32_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v16, v12
-; RV32ZVBC-NEXT:    vzext.vf2 v24, v8
-; RV32ZVBC-NEXT:    vclmul.vv v16, v24, v16
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v16, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv8i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v12
+; RV32ZVBC64-NEXT:    vzext.vf2 v24, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v16, v24, v16
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv8i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v12
+; RV64ZVBC64-NEXT:    vzext.vf2 v24, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v16, v24, v16
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv8i32_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v16, v12
-; RV64ZVBC-NEXT:    vzext.vf2 v24, v8
-; RV64ZVBC-NEXT:    vclmul.vv v16, v24, v16
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v16, 0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv8i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v16, v12, 2
+; RV32ZVBC32-NEXT:    vand.vi v20, v12, 1
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v20, v16
+; RV32ZVBC32-NEXT:    vand.vi v20, v12, 4
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vi v20, v12, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 16
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 2048
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 4096
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 8192
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 16384
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 32768
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 65536
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 131072
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    vand.vx v12, v12, a0
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv8i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v16, v12, 2
+; RV64ZVBC32-NEXT:    vand.vi v20, v12, 1
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v20, v16
+; RV64ZVBC32-NEXT:    vand.vi v20, v12, 4
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vi v20, v12, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 524288
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a0
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v12
+; RV64ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 8 x i32> @llvm.clmul.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb)
   ret <vscale x 8 x i32> %v
 }
@@ -4906,29 +8318,299 @@ define <vscale x 8 x i32> @clmul_nxv8i32_vx(<vscale x 8 x i32> %va, i32 %b) noun
 ; RV64V-NEXT:    vxor.vv v8, v16, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv8i32_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v24, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v16, v8
-; RV32ZVBC-NEXT:    vzext.vf2 v8, v24
-; RV32ZVBC-NEXT:    vclmul.vv v16, v16, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v16, 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv8i32_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v24
+; RV32ZVBC64-NEXT:    vclmul.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv8i32_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v24
+; RV64ZVBC64-NEXT:    vclmul.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv8i32_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v24, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v16, v8
-; RV64ZVBC-NEXT:    vzext.vf2 v8, v24
-; RV64ZVBC-NEXT:    vclmul.vv v16, v16, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v16, 0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv8i32_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -32
+; RV32ZVBC32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    andi s6, a0, 2
+; RV32ZVBC32-NEXT:    andi s5, a0, 1
+; RV32ZVBC32-NEXT:    andi s3, a0, 4
+; RV32ZVBC32-NEXT:    andi s1, a0, 8
+; RV32ZVBC32-NEXT:    andi t6, a0, 16
+; RV32ZVBC32-NEXT:    andi t4, a0, 32
+; RV32ZVBC32-NEXT:    andi t2, a0, 64
+; RV32ZVBC32-NEXT:    andi t0, a0, 128
+; RV32ZVBC32-NEXT:    andi a6, a0, 256
+; RV32ZVBC32-NEXT:    andi a4, a0, 512
+; RV32ZVBC32-NEXT:    andi a2, a0, 1024
+; RV32ZVBC32-NEXT:    li a1, 1
+; RV32ZVBC32-NEXT:    lui a3, 1
+; RV32ZVBC32-NEXT:    lui a5, 2
+; RV32ZVBC32-NEXT:    lui a7, 4
+; RV32ZVBC32-NEXT:    lui t1, 8
+; RV32ZVBC32-NEXT:    lui t3, 16
+; RV32ZVBC32-NEXT:    lui t5, 32
+; RV32ZVBC32-NEXT:    lui s0, 64
+; RV32ZVBC32-NEXT:    lui s2, 128
+; RV32ZVBC32-NEXT:    lui s4, 256
+; RV32ZVBC32-NEXT:    vsetvli s7, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s6
+; RV32ZVBC32-NEXT:    lui s6, 512
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s5
+; RV32ZVBC32-NEXT:    lui s5, 1024
+; RV32ZVBC32-NEXT:    vxor.vv v12, v16, v12
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s3
+; RV32ZVBC32-NEXT:    lui s3, 2048
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s1
+; RV32ZVBC32-NEXT:    lui s1, 4096
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t6
+; RV32ZVBC32-NEXT:    lui t6, 8192
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t4
+; RV32ZVBC32-NEXT:    lui t4, 16384
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t2
+; RV32ZVBC32-NEXT:    lui t2, 32768
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t0
+; RV32ZVBC32-NEXT:    lui t0, 65536
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a6
+; RV32ZVBC32-NEXT:    lui a6, 131072
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a4
+; RV32ZVBC32-NEXT:    lui a4, 262144
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 524288
+; RV32ZVBC32-NEXT:    slli a1, a1, 11
+; RV32ZVBC32-NEXT:    and a3, a0, a3
+; RV32ZVBC32-NEXT:    and a5, a0, a5
+; RV32ZVBC32-NEXT:    and a7, a0, a7
+; RV32ZVBC32-NEXT:    and t1, a0, t1
+; RV32ZVBC32-NEXT:    and t3, a0, t3
+; RV32ZVBC32-NEXT:    and t5, a0, t5
+; RV32ZVBC32-NEXT:    and s0, a0, s0
+; RV32ZVBC32-NEXT:    and s2, a0, s2
+; RV32ZVBC32-NEXT:    and s4, a0, s4
+; RV32ZVBC32-NEXT:    and s6, a0, s6
+; RV32ZVBC32-NEXT:    and s5, a0, s5
+; RV32ZVBC32-NEXT:    and s3, a0, s3
+; RV32ZVBC32-NEXT:    and s1, a0, s1
+; RV32ZVBC32-NEXT:    and t6, a0, t6
+; RV32ZVBC32-NEXT:    and t4, a0, t4
+; RV32ZVBC32-NEXT:    and t2, a0, t2
+; RV32ZVBC32-NEXT:    and t0, a0, t0
+; RV32ZVBC32-NEXT:    and a6, a0, a6
+; RV32ZVBC32-NEXT:    and a4, a0, a4
+; RV32ZVBC32-NEXT:    and a2, a0, a2
+; RV32ZVBC32-NEXT:    and a0, a0, a1
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a0
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a3
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a5
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a7
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t1
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t3
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t5
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s0
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s2
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s4
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s6
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s5
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s3
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s1
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t6
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t4
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t2
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t0
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a6
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a4
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vx v8, v8, a2
+; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 32
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv8i32_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v16, v12, 2
+; RV64ZVBC32-NEXT:    vand.vi v20, v12, 1
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v20, v16
+; RV64ZVBC32-NEXT:    vand.vi v20, v12, 4
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vi v20, v12, 8
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 16
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 524288
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a0
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v12
+; RV64ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
   %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
   %v = call <vscale x 8 x i32> @llvm.clmul.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb)
@@ -6131,17 +9813,803 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vv(<vscale x 1 x i64> %va, <vscale x 1
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv1i64_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vclmul.vv v8, v8, v9
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv1i64_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v8, v9
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv1i64_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v8, v9
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv1i64_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vclmul.vv v8, v8, v9
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv1i64_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -352
+; RV32ZVBC32-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a0, 3
+; RV32ZVBC32-NEXT:    sub a0, a1, a0
+; RV32ZVBC32-NEXT:    sub sp, sp, a0
+; RV32ZVBC32-NEXT:    lui a1, 524288
+; RV32ZVBC32-NEXT:    li t5, 1
+; RV32ZVBC32-NEXT:    li a4, 2
+; RV32ZVBC32-NEXT:    li a2, 4
+; RV32ZVBC32-NEXT:    li s11, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    li ra, 32
+; RV32ZVBC32-NEXT:    li s10, 64
+; RV32ZVBC32-NEXT:    li s9, 128
+; RV32ZVBC32-NEXT:    li s8, 256
+; RV32ZVBC32-NEXT:    li s7, 512
+; RV32ZVBC32-NEXT:    li s1, 1024
+; RV32ZVBC32-NEXT:    lui s6, 1
+; RV32ZVBC32-NEXT:    lui s5, 2
+; RV32ZVBC32-NEXT:    lui s4, 4
+; RV32ZVBC32-NEXT:    lui s3, 8
+; RV32ZVBC32-NEXT:    lui s2, 16
+; RV32ZVBC32-NEXT:    lui s0, 32
+; RV32ZVBC32-NEXT:    lui t6, 64
+; RV32ZVBC32-NEXT:    lui t4, 128
+; RV32ZVBC32-NEXT:    lui t3, 256
+; RV32ZVBC32-NEXT:    lui t2, 512
+; RV32ZVBC32-NEXT:    lui t1, 1024
+; RV32ZVBC32-NEXT:    lui t0, 2048
+; RV32ZVBC32-NEXT:    lui a7, 4096
+; RV32ZVBC32-NEXT:    lui a6, 8192
+; RV32ZVBC32-NEXT:    lui a5, 16384
+; RV32ZVBC32-NEXT:    lui a3, 32768
+; RV32ZVBC32-NEXT:    sw a1, 16(sp)
+; RV32ZVBC32-NEXT:    sw zero, 20(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw t5, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw a4, 268(sp)
+; RV32ZVBC32-NEXT:    lui a4, 65536
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw a2, 260(sp)
+; RV32ZVBC32-NEXT:    lui a2, 131072
+; RV32ZVBC32-NEXT:    sw zero, 248(sp)
+; RV32ZVBC32-NEXT:    sw s11, 252(sp)
+; RV32ZVBC32-NEXT:    vsetvli s11, zero, e64, m1, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v13, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v14, v9, 1
+; RV32ZVBC32-NEXT:    vand.vi v12, v9, 4
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    sw a0, 244(sp)
+; RV32ZVBC32-NEXT:    vand.vx v10, v9, a0
+; RV32ZVBC32-NEXT:    addi s11, sp, 16
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    sw ra, 236(sp)
+; RV32ZVBC32-NEXT:    vand.vx v15, v9, ra
+; RV32ZVBC32-NEXT:    addi ra, sp, 272
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw s10, 228(sp)
+; RV32ZVBC32-NEXT:    vand.vx v16, v9, s10
+; RV32ZVBC32-NEXT:    addi s10, sp, 264
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s9, 220(sp)
+; RV32ZVBC32-NEXT:    vand.vx v17, v9, s9
+; RV32ZVBC32-NEXT:    addi s9, sp, 256
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s8, 212(sp)
+; RV32ZVBC32-NEXT:    vand.vx v18, v9, s8
+; RV32ZVBC32-NEXT:    addi s8, sp, 248
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s7, 204(sp)
+; RV32ZVBC32-NEXT:    vand.vx v19, v9, s7
+; RV32ZVBC32-NEXT:    addi s7, sp, 240
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw s1, 196(sp)
+; RV32ZVBC32-NEXT:    vand.vx v20, v9, s1
+; RV32ZVBC32-NEXT:    slli t5, t5, 11
+; RV32ZVBC32-NEXT:    vand.vx v21, v9, s6
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw t5, 188(sp)
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw s6, 180(sp)
+; RV32ZVBC32-NEXT:    addi s6, sp, 224
+; RV32ZVBC32-NEXT:    vand.vx v22, v9, s5
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw s5, 172(sp)
+; RV32ZVBC32-NEXT:    addi s5, sp, 216
+; RV32ZVBC32-NEXT:    vand.vx v23, v9, s4
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw s4, 164(sp)
+; RV32ZVBC32-NEXT:    addi s4, sp, 208
+; RV32ZVBC32-NEXT:    vand.vx v24, v9, s3
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw s3, 156(sp)
+; RV32ZVBC32-NEXT:    addi s3, sp, 200
+; RV32ZVBC32-NEXT:    vand.vx v25, v9, s2
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw s2, 148(sp)
+; RV32ZVBC32-NEXT:    addi s2, sp, 192
+; RV32ZVBC32-NEXT:    vand.vx v26, v9, s0
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw s0, 140(sp)
+; RV32ZVBC32-NEXT:    addi s1, sp, 184
+; RV32ZVBC32-NEXT:    vand.vx v27, v9, t6
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw t6, 132(sp)
+; RV32ZVBC32-NEXT:    addi s0, sp, 176
+; RV32ZVBC32-NEXT:    vand.vx v28, v9, t4
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw t4, 124(sp)
+; RV32ZVBC32-NEXT:    addi t6, sp, 168
+; RV32ZVBC32-NEXT:    vand.vx v29, v9, t3
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw t3, 116(sp)
+; RV32ZVBC32-NEXT:    addi t4, sp, 160
+; RV32ZVBC32-NEXT:    vand.vx v30, v9, t2
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw t2, 108(sp)
+; RV32ZVBC32-NEXT:    addi t3, sp, 152
+; RV32ZVBC32-NEXT:    vand.vx v31, v9, t1
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw t1, 100(sp)
+; RV32ZVBC32-NEXT:    addi t2, sp, 144
+; RV32ZVBC32-NEXT:    vand.vx v7, v9, t0
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw t0, 92(sp)
+; RV32ZVBC32-NEXT:    addi t1, sp, 136
+; RV32ZVBC32-NEXT:    vand.vx v6, v9, a7
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw a7, 84(sp)
+; RV32ZVBC32-NEXT:    addi t0, sp, 128
+; RV32ZVBC32-NEXT:    vand.vx v5, v9, a6
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw a6, 76(sp)
+; RV32ZVBC32-NEXT:    addi a7, sp, 120
+; RV32ZVBC32-NEXT:    vand.vx v4, v9, a5
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw a5, 68(sp)
+; RV32ZVBC32-NEXT:    addi a6, sp, 112
+; RV32ZVBC32-NEXT:    vand.vx v3, v9, a3
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw a3, 60(sp)
+; RV32ZVBC32-NEXT:    addi a5, sp, 104
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, a4
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw a4, 52(sp)
+; RV32ZVBC32-NEXT:    addi a4, sp, 96
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a2
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw a2, 44(sp)
+; RV32ZVBC32-NEXT:    addi a3, sp, 88
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    sw a0, 36(sp)
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    sw a1, 28(sp)
+; RV32ZVBC32-NEXT:    addi a2, sp, 80
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, t5
+; RV32ZVBC32-NEXT:    addi a1, sp, 72
+; RV32ZVBC32-NEXT:    vmul.vv v13, v8, v13
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v13
+; RV32ZVBC32-NEXT:    vlse64.v v13, (s11), zero
+; RV32ZVBC32-NEXT:    addi s11, sp, 64
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v12
+; RV32ZVBC32-NEXT:    vlse64.v v12, (ra), zero
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    mv ra, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add t5, t5, ra
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vs1r.v v12, (t5) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi ra, sp, 56
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v11
+; RV32ZVBC32-NEXT:    vlse64.v v11, (s10), zero
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli s10, t5, 2
+; RV32ZVBC32-NEXT:    add t5, s10, t5
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vs1r.v v11, (t5) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s10, sp, 48
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v10
+; RV32ZVBC32-NEXT:    vlse64.v v10, (s9), zero
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 2
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (t5) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi t5, sp, 40
+; RV32ZVBC32-NEXT:    vmul.vv v15, v8, v15
+; RV32ZVBC32-NEXT:    vxor.vv v15, v14, v15
+; RV32ZVBC32-NEXT:    vlse64.v v10, (s8), zero
+; RV32ZVBC32-NEXT:    csrr s8, vlenb
+; RV32ZVBC32-NEXT:    slli s9, s8, 1
+; RV32ZVBC32-NEXT:    add s8, s9, s8
+; RV32ZVBC32-NEXT:    add s8, sp, s8
+; RV32ZVBC32-NEXT:    addi s8, s8, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (s8) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s8, sp, 32
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v16, v15, v16
+; RV32ZVBC32-NEXT:    vlse64.v v10, (s7), zero
+; RV32ZVBC32-NEXT:    csrr s7, vlenb
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s7, sp, s7
+; RV32ZVBC32-NEXT:    addi s7, s7, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (s7) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s7, sp, 24
+; RV32ZVBC32-NEXT:    vmul.vv v17, v8, v17
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vmul.vv v19, v8, v19
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vmul.vv v22, v8, v22
+; RV32ZVBC32-NEXT:    vmul.vv v23, v8, v23
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vmul.vv v25, v8, v25
+; RV32ZVBC32-NEXT:    vmul.vv v26, v8, v26
+; RV32ZVBC32-NEXT:    vmul.vv v27, v8, v27
+; RV32ZVBC32-NEXT:    vmul.vv v28, v8, v28
+; RV32ZVBC32-NEXT:    vmul.vv v29, v8, v29
+; RV32ZVBC32-NEXT:    vmul.vv v30, v8, v30
+; RV32ZVBC32-NEXT:    vmul.vv v31, v8, v31
+; RV32ZVBC32-NEXT:    vmul.vv v7, v8, v7
+; RV32ZVBC32-NEXT:    vmul.vv v6, v8, v6
+; RV32ZVBC32-NEXT:    vmul.vv v5, v8, v5
+; RV32ZVBC32-NEXT:    vmul.vv v4, v8, v4
+; RV32ZVBC32-NEXT:    vmul.vv v3, v8, v3
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v17
+; RV32ZVBC32-NEXT:    addi s9, sp, 232
+; RV32ZVBC32-NEXT:    vlse64.v v11, (s9), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    vlse64.v v10, (s6), zero
+; RV32ZVBC32-NEXT:    csrr s6, vlenb
+; RV32ZVBC32-NEXT:    add s6, sp, s6
+; RV32ZVBC32-NEXT:    addi s6, s6, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (s6) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v19
+; RV32ZVBC32-NEXT:    vlse64.v v10, (s5), zero
+; RV32ZVBC32-NEXT:    addi s5, sp, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (s5) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vlse64.v v12, (s4), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v0
+; RV32ZVBC32-NEXT:    vlse64.v v0, (s3), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v21
+; RV32ZVBC32-NEXT:    vlse64.v v21, (s2), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v22
+; RV32ZVBC32-NEXT:    vlse64.v v22, (s1), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v23
+; RV32ZVBC32-NEXT:    vlse64.v v23, (s0), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t6), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v25
+; RV32ZVBC32-NEXT:    vlse64.v v25, (t4), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v26
+; RV32ZVBC32-NEXT:    vlse64.v v26, (t3), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v27
+; RV32ZVBC32-NEXT:    vlse64.v v27, (t2), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v28
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t1), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v29
+; RV32ZVBC32-NEXT:    vlse64.v v29, (t0), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v30
+; RV32ZVBC32-NEXT:    vlse64.v v30, (a7), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v31
+; RV32ZVBC32-NEXT:    vlse64.v v31, (a6), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v7
+; RV32ZVBC32-NEXT:    vlse64.v v7, (a5), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v6
+; RV32ZVBC32-NEXT:    vlse64.v v6, (a4), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v5
+; RV32ZVBC32-NEXT:    vlse64.v v5, (a3), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v4
+; RV32ZVBC32-NEXT:    vlse64.v v4, (a2), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v3
+; RV32ZVBC32-NEXT:    vlse64.v v3, (a1), zero
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v2
+; RV32ZVBC32-NEXT:    vlse64.v v2, (s11), zero
+; RV32ZVBC32-NEXT:    vxor.vv v1, v16, v1
+; RV32ZVBC32-NEXT:    vlse64.v v10, (ra), zero
+; RV32ZVBC32-NEXT:    vand.vv v13, v9, v13
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    mv a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a2
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vl1r.v v14, (a1) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v14, v9, v14
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a1, 2
+; RV32ZVBC32-NEXT:    add a1, a2, a1
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vl1r.v v15, (a1) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v15, v9, v15
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 2
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vl1r.v v16, (a1) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v16, v9, v16
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a1, 1
+; RV32ZVBC32-NEXT:    add a1, a2, a1
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vl1r.v v17, (a1) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v17, v9, v17
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vl1r.v v18, (a1) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v18, v9, v18
+; RV32ZVBC32-NEXT:    vand.vv v19, v9, v11
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vl1r.v v11, (a1) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v20, v9, v11
+; RV32ZVBC32-NEXT:    addi a1, sp, 288
+; RV32ZVBC32-NEXT:    vl1r.v v11, (a1) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v11, v9, v12
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a1, 1
+; RV32ZVBC32-NEXT:    add a1, a2, a1
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v0, v9, v0
+; RV32ZVBC32-NEXT:    vand.vv v21, v9, v21
+; RV32ZVBC32-NEXT:    vand.vv v22, v9, v22
+; RV32ZVBC32-NEXT:    vand.vv v23, v9, v23
+; RV32ZVBC32-NEXT:    vand.vv v24, v9, v24
+; RV32ZVBC32-NEXT:    vand.vv v25, v9, v25
+; RV32ZVBC32-NEXT:    vand.vv v26, v9, v26
+; RV32ZVBC32-NEXT:    vand.vv v27, v9, v27
+; RV32ZVBC32-NEXT:    vand.vv v28, v9, v28
+; RV32ZVBC32-NEXT:    vand.vv v29, v9, v29
+; RV32ZVBC32-NEXT:    vand.vv v30, v9, v30
+; RV32ZVBC32-NEXT:    vand.vv v31, v9, v31
+; RV32ZVBC32-NEXT:    vand.vv v7, v9, v7
+; RV32ZVBC32-NEXT:    vand.vv v6, v9, v6
+; RV32ZVBC32-NEXT:    vand.vv v5, v9, v5
+; RV32ZVBC32-NEXT:    vand.vv v4, v9, v4
+; RV32ZVBC32-NEXT:    vand.vv v11, v9, v3
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 2
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v2, v9, v2
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v10
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    mv a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a2
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (a1) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v10, (s10), zero
+; RV32ZVBC32-NEXT:    vlse64.v v3, (t5), zero
+; RV32ZVBC32-NEXT:    vlse64.v v11, (s8), zero
+; RV32ZVBC32-NEXT:    vlse64.v v12, (s7), zero
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v10
+; RV32ZVBC32-NEXT:    vand.vv v3, v9, v3
+; RV32ZVBC32-NEXT:    vand.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a1, 2
+; RV32ZVBC32-NEXT:    add a1, a2, a1
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v9, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v9, v1, v9
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v13
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v15
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v17
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v19
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a1, a0
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v22
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v23
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v25
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v26
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v27
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v28
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v29
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v30
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v31
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v7
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v6
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v5
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v3
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a1, a0
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a0, 3
+; RV32ZVBC32-NEXT:    sub a0, a1, a0
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 352
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv1i64_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    li a1, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    li a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    li a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a2
+; RV64ZVBC32-NEXT:    slli a1, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 31
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 33
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 34
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 35
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 36
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 37
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 38
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 39
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 40
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 41
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 42
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 43
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 44
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 45
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 46
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 47
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 48
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 49
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 50
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 51
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 52
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 53
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 54
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 55
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 56
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 57
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 58
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 59
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 60
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 61
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    li a1, -1
+; RV64ZVBC32-NEXT:    slli a0, a0, 62
+; RV64ZVBC32-NEXT:    slli a1, a1, 63
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 1 x i64> @llvm.clmul.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb)
   ret <vscale x 1 x i64> %v
 }
@@ -6870,23 +11338,746 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vx(<vscale x 1 x i64> %va, i64 %b) noun
 ; RV64V-NEXT:    vxor.vv v8, v9, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv1i64_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    addi sp, sp, -16
-; RV32ZVBC-NEXT:    sw a0, 8(sp)
-; RV32ZVBC-NEXT:    sw a1, 12(sp)
-; RV32ZVBC-NEXT:    addi a0, sp, 8
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vlse64.v v9, (a0), zero
-; RV32ZVBC-NEXT:    vclmul.vv v8, v8, v9
-; RV32ZVBC-NEXT:    addi sp, sp, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv1i64_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    addi sp, sp, -16
+; RV32ZVBC64-NEXT:    sw a0, 8(sp)
+; RV32ZVBC64-NEXT:    sw a1, 12(sp)
+; RV32ZVBC64-NEXT:    addi a0, sp, 8
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vlse64.v v9, (a0), zero
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v8, v9
+; RV32ZVBC64-NEXT:    addi sp, sp, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv1i64_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vclmul.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv1i64_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vclmul.vx v8, v8, a0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmul_nxv1i64_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -352
+; RV32ZVBC32-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a3, a2, 2
+; RV32ZVBC32-NEXT:    add a2, a3, a2
+; RV32ZVBC32-NEXT:    sub sp, sp, a2
+; RV32ZVBC32-NEXT:    sw a0, 8(sp)
+; RV32ZVBC32-NEXT:    sw a1, 12(sp)
+; RV32ZVBC32-NEXT:    addi s4, sp, 8
+; RV32ZVBC32-NEXT:    lui s2, 524288
+; RV32ZVBC32-NEXT:    li s11, 1
+; RV32ZVBC32-NEXT:    li s6, 2
+; RV32ZVBC32-NEXT:    li s8, 4
+; RV32ZVBC32-NEXT:    li s10, 8
+; RV32ZVBC32-NEXT:    li ra, 64
+; RV32ZVBC32-NEXT:    li s9, 128
+; RV32ZVBC32-NEXT:    li s7, 256
+; RV32ZVBC32-NEXT:    li s5, 512
+; RV32ZVBC32-NEXT:    li s3, 1024
+; RV32ZVBC32-NEXT:    lui s1, 1
+; RV32ZVBC32-NEXT:    lui s0, 2
+; RV32ZVBC32-NEXT:    lui t6, 4
+; RV32ZVBC32-NEXT:    lui t5, 8
+; RV32ZVBC32-NEXT:    lui t4, 16
+; RV32ZVBC32-NEXT:    lui t3, 32
+; RV32ZVBC32-NEXT:    lui t2, 64
+; RV32ZVBC32-NEXT:    lui t1, 128
+; RV32ZVBC32-NEXT:    lui t0, 256
+; RV32ZVBC32-NEXT:    lui a6, 512
+; RV32ZVBC32-NEXT:    lui a5, 1024
+; RV32ZVBC32-NEXT:    lui a4, 2048
+; RV32ZVBC32-NEXT:    lui a3, 4096
+; RV32ZVBC32-NEXT:    lui a2, 8192
+; RV32ZVBC32-NEXT:    lui a0, 16384
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32ZVBC32-NEXT:    vlse64.v v9, (s4), zero
+; RV32ZVBC32-NEXT:    lui s4, 32768
+; RV32ZVBC32-NEXT:    sw s2, 16(sp)
+; RV32ZVBC32-NEXT:    lui a7, 524288
+; RV32ZVBC32-NEXT:    sw zero, 20(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw s11, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw s6, 268(sp)
+; RV32ZVBC32-NEXT:    lui s6, 65536
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw s8, 260(sp)
+; RV32ZVBC32-NEXT:    lui s8, 131072
+; RV32ZVBC32-NEXT:    sw zero, 248(sp)
+; RV32ZVBC32-NEXT:    sw s10, 252(sp)
+; RV32ZVBC32-NEXT:    lui a1, 262144
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    li s2, 16
+; RV32ZVBC32-NEXT:    sw s2, 244(sp)
+; RV32ZVBC32-NEXT:    li s10, 16
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    li s2, 32
+; RV32ZVBC32-NEXT:    sw s2, 236(sp)
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw ra, 228(sp)
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s9, 220(sp)
+; RV32ZVBC32-NEXT:    li s2, 128
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s7, 212(sp)
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s5, 204(sp)
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw s3, 196(sp)
+; RV32ZVBC32-NEXT:    slli s11, s11, 11
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw s11, 188(sp)
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw s1, 180(sp)
+; RV32ZVBC32-NEXT:    lui s3, 1
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw s0, 172(sp)
+; RV32ZVBC32-NEXT:    lui s1, 2
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw t6, 164(sp)
+; RV32ZVBC32-NEXT:    lui s0, 4
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw t5, 156(sp)
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw t4, 148(sp)
+; RV32ZVBC32-NEXT:    lui t6, 16
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw t3, 140(sp)
+; RV32ZVBC32-NEXT:    lui t4, 32
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw t2, 132(sp)
+; RV32ZVBC32-NEXT:    lui t3, 64
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw t1, 124(sp)
+; RV32ZVBC32-NEXT:    lui t2, 128
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw t0, 116(sp)
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw a6, 108(sp)
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw a5, 100(sp)
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw a4, 92(sp)
+; RV32ZVBC32-NEXT:    lui t1, 2048
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw a3, 84(sp)
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw a2, 76(sp)
+; RV32ZVBC32-NEXT:    lui a4, 8192
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw a0, 68(sp)
+; RV32ZVBC32-NEXT:    lui a2, 16384
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw s4, 60(sp)
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw s6, 52(sp)
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw s8, 44(sp)
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    sw a1, 36(sp)
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    sw a7, 28(sp)
+; RV32ZVBC32-NEXT:    addi a1, sp, 16
+; RV32ZVBC32-NEXT:    vlse64.v v11, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 272
+; RV32ZVBC32-NEXT:    vlse64.v v10, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 264
+; RV32ZVBC32-NEXT:    vlse64.v v13, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 256
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 248
+; RV32ZVBC32-NEXT:    vlse64.v v15, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 240
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 232
+; RV32ZVBC32-NEXT:    vlse64.v v17, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 224
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 216
+; RV32ZVBC32-NEXT:    vlse64.v v19, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 208
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 200
+; RV32ZVBC32-NEXT:    vlse64.v v21, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 192
+; RV32ZVBC32-NEXT:    vlse64.v v22, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 184
+; RV32ZVBC32-NEXT:    vlse64.v v23, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 176
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 168
+; RV32ZVBC32-NEXT:    vlse64.v v25, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 160
+; RV32ZVBC32-NEXT:    vlse64.v v26, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 152
+; RV32ZVBC32-NEXT:    vlse64.v v27, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 144
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 136
+; RV32ZVBC32-NEXT:    vlse64.v v29, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 128
+; RV32ZVBC32-NEXT:    vlse64.v v30, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 120
+; RV32ZVBC32-NEXT:    vlse64.v v31, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 112
+; RV32ZVBC32-NEXT:    vlse64.v v7, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 104
+; RV32ZVBC32-NEXT:    vlse64.v v6, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 96
+; RV32ZVBC32-NEXT:    vlse64.v v5, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 88
+; RV32ZVBC32-NEXT:    vlse64.v v3, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 80
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a1), zero
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 2
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs1r.v v12, (a1) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a1, sp, 72
+; RV32ZVBC32-NEXT:    vlse64.v v4, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 64
+; RV32ZVBC32-NEXT:    vlse64.v v2, (a1), zero
+; RV32ZVBC32-NEXT:    addi ra, sp, 56
+; RV32ZVBC32-NEXT:    vand.vi v1, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v0, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v0, v1
+; RV32ZVBC32-NEXT:    vand.vi v0, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vi v0, v9, 8
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, s10
+; RV32ZVBC32-NEXT:    addi s10, sp, 48
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    li a1, 32
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, a1
+; RV32ZVBC32-NEXT:    addi s9, sp, 40
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    li a1, 64
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, a1
+; RV32ZVBC32-NEXT:    addi s7, sp, 32
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, s2
+; RV32ZVBC32-NEXT:    addi s5, sp, 24
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    li a1, 256
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, a1
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    li a1, 512
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, a1
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    li a1, 1024
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, a1
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, s11
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, s3
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, s1
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, s0
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, t5
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, t6
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, t4
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, t3
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, t2
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, t0
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, a6
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, a5
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, t1
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, a3
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, a4
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, a2
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, s4
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, s6
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, s8
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v0
+; RV32ZVBC32-NEXT:    vlse64.v v0, (ra), zero
+; RV32ZVBC32-NEXT:    vand.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vand.vv v12, v9, v10
+; RV32ZVBC32-NEXT:    vand.vv v13, v9, v13
+; RV32ZVBC32-NEXT:    vand.vv v14, v9, v14
+; RV32ZVBC32-NEXT:    vand.vv v15, v9, v15
+; RV32ZVBC32-NEXT:    vand.vv v16, v9, v16
+; RV32ZVBC32-NEXT:    vand.vv v17, v9, v17
+; RV32ZVBC32-NEXT:    vand.vv v18, v9, v18
+; RV32ZVBC32-NEXT:    vand.vv v19, v9, v19
+; RV32ZVBC32-NEXT:    vand.vv v20, v9, v20
+; RV32ZVBC32-NEXT:    vand.vv v21, v9, v21
+; RV32ZVBC32-NEXT:    vand.vv v22, v9, v22
+; RV32ZVBC32-NEXT:    vand.vv v23, v9, v23
+; RV32ZVBC32-NEXT:    vand.vv v24, v9, v24
+; RV32ZVBC32-NEXT:    vand.vv v25, v9, v25
+; RV32ZVBC32-NEXT:    vand.vv v26, v9, v26
+; RV32ZVBC32-NEXT:    vand.vv v27, v9, v27
+; RV32ZVBC32-NEXT:    vand.vv v28, v9, v28
+; RV32ZVBC32-NEXT:    vand.vv v29, v9, v29
+; RV32ZVBC32-NEXT:    vand.vv v30, v9, v30
+; RV32ZVBC32-NEXT:    vand.vv v31, v9, v31
+; RV32ZVBC32-NEXT:    vand.vv v7, v9, v7
+; RV32ZVBC32-NEXT:    vand.vv v6, v9, v6
+; RV32ZVBC32-NEXT:    vand.vv v5, v9, v5
+; RV32ZVBC32-NEXT:    vand.vv v3, v9, v3
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 2
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a1) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v10
+; RV32ZVBC32-NEXT:    addi a1, sp, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (a1) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v4
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (a1) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v2
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (a1) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v0
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 2
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (a1) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v4, (s10), zero
+; RV32ZVBC32-NEXT:    vlse64.v v2, (s9), zero
+; RV32ZVBC32-NEXT:    vlse64.v v0, (s7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v10, (s5), zero
+; RV32ZVBC32-NEXT:    vand.vv v4, v9, v4
+; RV32ZVBC32-NEXT:    vand.vv v2, v9, v2
+; RV32ZVBC32-NEXT:    vand.vv v0, v9, v0
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v10
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a1, 1
+; RV32ZVBC32-NEXT:    add a1, a2, a1
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (a1) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v9, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v9, v1, v9
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v13
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v15
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v17
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v19
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v22
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v23
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v25
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v26
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v27
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v28
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v29
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v30
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v31
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v7
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v6
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v5
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v3
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a1, a0
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a1, a0
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 352
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv1i64_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    andi a1, a0, 2
+; RV64ZVBC32-NEXT:    andi a2, a0, 1
+; RV64ZVBC32-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV64ZVBC32-NEXT:    vmul.vx v9, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 4
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 8
+; RV64ZVBC32-NEXT:    vxor.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 16
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 32
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 64
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 128
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 256
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 512
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a1
+; RV64ZVBC32-NEXT:    andi a3, a0, 1024
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    li a1, 1
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a3
+; RV64ZVBC32-NEXT:    slli a2, a1, 11
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 64
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 128
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 256
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 512
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1024
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2048
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4096
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8192
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16384
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32768
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 65536
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 131072
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 262144
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    srliw a2, a0, 31
+; RV64ZVBC32-NEXT:    slli a2, a2, 31
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 32
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 33
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 34
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 35
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 36
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 37
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 38
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 39
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 40
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 41
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 42
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 43
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 44
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 45
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 46
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 47
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 48
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 49
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 50
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 51
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 52
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 53
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 54
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 55
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 56
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 57
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 58
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 59
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 60
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 61
+; RV64ZVBC32-NEXT:    slli a1, a1, 62
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    and a1, a0, a1
+; RV64ZVBC32-NEXT:    srli a0, a0, 63
+; RV64ZVBC32-NEXT:    slli a0, a0, 63
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a1
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
   %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
   %v = call <vscale x 1 x i64> @llvm.clmul.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb)
@@ -8018,17 +13209,1141 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV64V-NEXT:    vxor.vv v8, v12, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv2i64_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vclmul.vv v8, v8, v10
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv2i64_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v8, v10
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv2i64_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v8, v10
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv2i64_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -352
+; RV32ZVBC32-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    sub sp, sp, a0
+; RV32ZVBC32-NEXT:    lui a1, 524288
+; RV32ZVBC32-NEXT:    li s2, 1
+; RV32ZVBC32-NEXT:    li a3, 2
+; RV32ZVBC32-NEXT:    li a2, 4
+; RV32ZVBC32-NEXT:    li s7, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    li s6, 32
+; RV32ZVBC32-NEXT:    li s5, 64
+; RV32ZVBC32-NEXT:    li s3, 128
+; RV32ZVBC32-NEXT:    li s1, 256
+; RV32ZVBC32-NEXT:    li s0, 512
+; RV32ZVBC32-NEXT:    li t5, 1024
+; RV32ZVBC32-NEXT:    lui ra, 1
+; RV32ZVBC32-NEXT:    lui s8, 2
+; RV32ZVBC32-NEXT:    lui s10, 4
+; RV32ZVBC32-NEXT:    lui s11, 8
+; RV32ZVBC32-NEXT:    lui s9, 16
+; RV32ZVBC32-NEXT:    lui s4, 32
+; RV32ZVBC32-NEXT:    lui t6, 64
+; RV32ZVBC32-NEXT:    lui t4, 128
+; RV32ZVBC32-NEXT:    lui t3, 256
+; RV32ZVBC32-NEXT:    lui t2, 512
+; RV32ZVBC32-NEXT:    lui t1, 1024
+; RV32ZVBC32-NEXT:    lui t0, 2048
+; RV32ZVBC32-NEXT:    lui a7, 4096
+; RV32ZVBC32-NEXT:    lui a6, 8192
+; RV32ZVBC32-NEXT:    lui a5, 16384
+; RV32ZVBC32-NEXT:    lui a4, 32768
+; RV32ZVBC32-NEXT:    sw a1, 16(sp)
+; RV32ZVBC32-NEXT:    sw zero, 20(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw s2, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw a3, 268(sp)
+; RV32ZVBC32-NEXT:    lui a3, 65536
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw a2, 260(sp)
+; RV32ZVBC32-NEXT:    lui a2, 131072
+; RV32ZVBC32-NEXT:    sw zero, 248(sp)
+; RV32ZVBC32-NEXT:    sw s7, 252(sp)
+; RV32ZVBC32-NEXT:    vsetvli s7, zero, e64, m2, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v24, v10, 2
+; RV32ZVBC32-NEXT:    vand.vi v20, v10, 1
+; RV32ZVBC32-NEXT:    vand.vi v26, v10, 4
+; RV32ZVBC32-NEXT:    vand.vi v14, v10, 8
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    sw a0, 244(sp)
+; RV32ZVBC32-NEXT:    vand.vx v12, v10, a0
+; RV32ZVBC32-NEXT:    addi s7, sp, 16
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    sw s6, 236(sp)
+; RV32ZVBC32-NEXT:    vand.vx v16, v10, s6
+; RV32ZVBC32-NEXT:    addi s6, sp, 272
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw s5, 228(sp)
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, s5
+; RV32ZVBC32-NEXT:    addi s5, sp, 264
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s3, 220(sp)
+; RV32ZVBC32-NEXT:    vand.vx v0, v10, s3
+; RV32ZVBC32-NEXT:    addi s3, sp, 256
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s1, 212(sp)
+; RV32ZVBC32-NEXT:    vand.vx v6, v10, s1
+; RV32ZVBC32-NEXT:    addi s1, sp, 248
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s0, 204(sp)
+; RV32ZVBC32-NEXT:    vand.vx v4, v10, s0
+; RV32ZVBC32-NEXT:    addi s0, sp, 240
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw t5, 196(sp)
+; RV32ZVBC32-NEXT:    vand.vx v2, v10, t5
+; RV32ZVBC32-NEXT:    slli s2, s2, 11
+; RV32ZVBC32-NEXT:    vand.vx v28, v10, ra
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw s2, 188(sp)
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw ra, 180(sp)
+; RV32ZVBC32-NEXT:    addi t5, sp, 224
+; RV32ZVBC32-NEXT:    vand.vx v30, v10, s8
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw s8, 172(sp)
+; RV32ZVBC32-NEXT:    addi s8, sp, 216
+; RV32ZVBC32-NEXT:    vand.vx v22, v10, s10
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw s10, 164(sp)
+; RV32ZVBC32-NEXT:    addi s10, sp, 208
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v10, s11
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw s11, 156(sp)
+; RV32ZVBC32-NEXT:    addi s11, sp, 200
+; RV32ZVBC32-NEXT:    vmul.vv v26, v8, v26
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v26
+; RV32ZVBC32-NEXT:    vand.vx v26, v10, s9
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw s9, 148(sp)
+; RV32ZVBC32-NEXT:    addi s9, sp, 192
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v14, v20, v14
+; RV32ZVBC32-NEXT:    vand.vx v20, v10, s4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv ra, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, ra
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v20, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw s4, 140(sp)
+; RV32ZVBC32-NEXT:    addi s4, sp, 184
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV32ZVBC32-NEXT:    vand.vx v14, v10, t6
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw t6, 132(sp)
+; RV32ZVBC32-NEXT:    addi t6, sp, 176
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v10, t4
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw t4, 124(sp)
+; RV32ZVBC32-NEXT:    addi t4, sp, 168
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v18, v12, v18
+; RV32ZVBC32-NEXT:    vand.vx v12, v10, t3
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw t3, 116(sp)
+; RV32ZVBC32-NEXT:    addi t3, sp, 160
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v10, t2
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw t2, 108(sp)
+; RV32ZVBC32-NEXT:    addi t2, sp, 152
+; RV32ZVBC32-NEXT:    vmul.vv v6, v8, v6
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v6
+; RV32ZVBC32-NEXT:    vand.vx v6, v10, t1
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw t1, 100(sp)
+; RV32ZVBC32-NEXT:    addi t1, sp, 144
+; RV32ZVBC32-NEXT:    vmul.vv v4, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v4
+; RV32ZVBC32-NEXT:    vand.vx v4, v10, t0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv ra, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add ra, ra, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, ra
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v4, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw t0, 92(sp)
+; RV32ZVBC32-NEXT:    addi t0, sp, 136
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v10, s2
+; RV32ZVBC32-NEXT:    addi ra, sp, 128
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v10, a7
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw a7, 84(sp)
+; RV32ZVBC32-NEXT:    addi a7, sp, 120
+; RV32ZVBC32-NEXT:    vmul.vv v28, v8, v28
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v28
+; RV32ZVBC32-NEXT:    vand.vx v4, v10, a6
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw a6, 76(sp)
+; RV32ZVBC32-NEXT:    addi a6, sp, 112
+; RV32ZVBC32-NEXT:    vmul.vv v30, v8, v30
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v30
+; RV32ZVBC32-NEXT:    vand.vx v30, v10, a5
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw a5, 68(sp)
+; RV32ZVBC32-NEXT:    addi a5, sp, 104
+; RV32ZVBC32-NEXT:    vmul.vv v22, v8, v22
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v22
+; RV32ZVBC32-NEXT:    vand.vx v28, v10, a4
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw a4, 60(sp)
+; RV32ZVBC32-NEXT:    addi a4, sp, 96
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v10, a3
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw a3, 52(sp)
+; RV32ZVBC32-NEXT:    addi a3, sp, 88
+; RV32ZVBC32-NEXT:    vmul.vv v26, v8, v26
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v26
+; RV32ZVBC32-NEXT:    vand.vx v26, v10, a2
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw a2, 44(sp)
+; RV32ZVBC32-NEXT:    addi a2, sp, 80
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    sw a0, 36(sp)
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    sw a1, 28(sp)
+; RV32ZVBC32-NEXT:    addi a1, sp, 72
+; RV32ZVBC32-NEXT:    sw a6, 4(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 3
+; RV32ZVBC32-NEXT:    mv s2, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 2
+; RV32ZVBC32-NEXT:    add a6, a6, s2
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vl2r.v v22, (a6) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v22
+; RV32ZVBC32-NEXT:    vxor.vv v0, v18, v0
+; RV32ZVBC32-NEXT:    vlse64.v v18, (s7), zero
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 3
+; RV32ZVBC32-NEXT:    mv s2, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 2
+; RV32ZVBC32-NEXT:    add a6, a6, s2
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vs2r.v v18, (a6) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s7, sp, 64
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v14, v0, v14
+; RV32ZVBC32-NEXT:    vlse64.v v18, (s6), zero
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 2
+; RV32ZVBC32-NEXT:    mv s2, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 3
+; RV32ZVBC32-NEXT:    add a6, a6, s2
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vs2r.v v18, (a6) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s2, sp, 56
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    vlse64.v v16, (s5), zero
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 1
+; RV32ZVBC32-NEXT:    mv s5, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 4
+; RV32ZVBC32-NEXT:    add a6, a6, s5
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vs2r.v v16, (a6) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s5, sp, 48
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV32ZVBC32-NEXT:    vlse64.v v14, (s3), zero
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 5
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vs2r.v v14, (a6) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s3, sp, 40
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vlse64.v v12, (s1), zero
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 1
+; RV32ZVBC32-NEXT:    mv s1, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 1
+; RV32ZVBC32-NEXT:    add s1, s1, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 1
+; RV32ZVBC32-NEXT:    add s1, s1, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 1
+; RV32ZVBC32-NEXT:    add a6, a6, s1
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a6) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s1, sp, 32
+; RV32ZVBC32-NEXT:    vmul.vv v6, v8, v6
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v6
+; RV32ZVBC32-NEXT:    vlse64.v v12, (s0), zero
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 2
+; RV32ZVBC32-NEXT:    mv s0, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 1
+; RV32ZVBC32-NEXT:    add s0, s0, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 1
+; RV32ZVBC32-NEXT:    add a6, a6, s0
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a6) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s0, sp, 24
+; RV32ZVBC32-NEXT:    csrr s6, vlenb
+; RV32ZVBC32-NEXT:    slli s6, s6, 1
+; RV32ZVBC32-NEXT:    mv a6, s6
+; RV32ZVBC32-NEXT:    slli s6, s6, 1
+; RV32ZVBC32-NEXT:    add a6, a6, s6
+; RV32ZVBC32-NEXT:    slli s6, s6, 3
+; RV32ZVBC32-NEXT:    add s6, s6, a6
+; RV32ZVBC32-NEXT:    lw a6, 4(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    add s6, sp, s6
+; RV32ZVBC32-NEXT:    addi s6, s6, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (s6) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v6, v8, v12
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vmul.vv v4, v8, v4
+; RV32ZVBC32-NEXT:    vmul.vv v30, v8, v30
+; RV32ZVBC32-NEXT:    vmul.vv v28, v8, v28
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vmul.vv v26, v8, v26
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v6
+; RV32ZVBC32-NEXT:    addi s6, sp, 232
+; RV32ZVBC32-NEXT:    vlse64.v v0, (s6), zero
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v2
+; RV32ZVBC32-NEXT:    vlse64.v v6, (t5), zero
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v4
+; RV32ZVBC32-NEXT:    vlse64.v v22, (s8), zero
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v30
+; RV32ZVBC32-NEXT:    vlse64.v v18, (s10), zero
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    vlse64.v v16, (s11), zero
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v24
+; RV32ZVBC32-NEXT:    vlse64.v v14, (s9), zero
+; RV32ZVBC32-NEXT:    vxor.vv v2, v20, v26
+; RV32ZVBC32-NEXT:    vlse64.v v12, (s4), zero
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 3
+; RV32ZVBC32-NEXT:    mv s4, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 2
+; RV32ZVBC32-NEXT:    add t5, t5, s4
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vl2r.v v20, (t5) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v26, v10, v20
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 2
+; RV32ZVBC32-NEXT:    mv s4, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 3
+; RV32ZVBC32-NEXT:    add t5, t5, s4
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vl2r.v v20, (t5) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v4, v10, v20
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    mv s4, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 4
+; RV32ZVBC32-NEXT:    add t5, t5, s4
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vl2r.v v20, (t5) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v30, v10, v20
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 5
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vl2r.v v20, (t5) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v20, v10, v20
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    mv s4, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add s4, s4, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add s4, s4, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add t5, t5, s4
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vl2r.v v24, (t5) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v28, v10, v24
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 2
+; RV32ZVBC32-NEXT:    mv s4, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add s4, s4, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add t5, t5, s4
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vl2r.v v24, (t5) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v10, v24
+; RV32ZVBC32-NEXT:    vand.vv v0, v10, v0
+; RV32ZVBC32-NEXT:    vand.vv v6, v10, v6
+; RV32ZVBC32-NEXT:    vand.vv v22, v10, v22
+; RV32ZVBC32-NEXT:    vand.vv v18, v10, v18
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 3
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vs2r.v v18, (t5) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 2
+; RV32ZVBC32-NEXT:    mv s4, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 2
+; RV32ZVBC32-NEXT:    add t5, t5, s4
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vs2r.v v16, (t5) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    mv s4, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add s4, s4, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add s4, s4, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add t5, t5, s4
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vs2r.v v14, (t5) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 3
+; RV32ZVBC32-NEXT:    mv s4, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 2
+; RV32ZVBC32-NEXT:    add t5, t5, s4
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (t5) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v12, (t6), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (t4), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (t3), zero
+; RV32ZVBC32-NEXT:    vlse64.v v18, (t2), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    mv t3, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t2, t2, t3
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (t2) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v14
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    mv t3, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 3
+; RV32ZVBC32-NEXT:    add t2, t2, t3
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (t2) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v16
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 2
+; RV32ZVBC32-NEXT:    mv t3, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t3, t3, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t2, t2, t3
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (t2) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v18
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    mv t3, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t3, t3, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 3
+; RV32ZVBC32-NEXT:    add t2, t2, t3
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (t2) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v12, (t1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (t0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (ra), zero
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a7), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 2
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a7) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v14
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 4
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a7) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v16
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 1
+; RV32ZVBC32-NEXT:    mv t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 2
+; RV32ZVBC32-NEXT:    add t0, t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 1
+; RV32ZVBC32-NEXT:    add a7, a7, t0
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a7) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v18
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 2
+; RV32ZVBC32-NEXT:    mv t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 3
+; RV32ZVBC32-NEXT:    add a7, a7, t0
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a7) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a6), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a5), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a4), zero
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a3), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    csrr a3, vlenb
+; RV32ZVBC32-NEXT:    slli a3, a3, 1
+; RV32ZVBC32-NEXT:    add a3, sp, a3
+; RV32ZVBC32-NEXT:    addi a3, a3, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a3) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v14
+; RV32ZVBC32-NEXT:    csrr a3, vlenb
+; RV32ZVBC32-NEXT:    slli a3, a3, 1
+; RV32ZVBC32-NEXT:    mv a4, a3
+; RV32ZVBC32-NEXT:    slli a3, a3, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a3
+; RV32ZVBC32-NEXT:    slli a3, a3, 1
+; RV32ZVBC32-NEXT:    add a3, a3, a4
+; RV32ZVBC32-NEXT:    add a3, sp, a3
+; RV32ZVBC32-NEXT:    addi a3, a3, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a3) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v16
+; RV32ZVBC32-NEXT:    csrr a3, vlenb
+; RV32ZVBC32-NEXT:    slli a3, a3, 3
+; RV32ZVBC32-NEXT:    mv a4, a3
+; RV32ZVBC32-NEXT:    slli a3, a3, 1
+; RV32ZVBC32-NEXT:    add a3, a3, a4
+; RV32ZVBC32-NEXT:    add a3, sp, a3
+; RV32ZVBC32-NEXT:    addi a3, a3, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a3) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v18
+; RV32ZVBC32-NEXT:    csrr a3, vlenb
+; RV32ZVBC32-NEXT:    slli a3, a3, 1
+; RV32ZVBC32-NEXT:    mv a4, a3
+; RV32ZVBC32-NEXT:    slli a3, a3, 4
+; RV32ZVBC32-NEXT:    add a3, a3, a4
+; RV32ZVBC32-NEXT:    add a3, sp, a3
+; RV32ZVBC32-NEXT:    addi a3, a3, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a3) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (s7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v18, (s2), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    addi a1, sp, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a1) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v14
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 2
+; RV32ZVBC32-NEXT:    mv a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a2
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a1) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v16
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    mv a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a2
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a1) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v18
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 5
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a1) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v14, (s5), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (s3), zero
+; RV32ZVBC32-NEXT:    vlse64.v v18, (s1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v12, (s0), zero
+; RV32ZVBC32-NEXT:    vand.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vand.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    mv a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a2
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs2r.v v16, (a1) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v18, v10, v18
+; RV32ZVBC32-NEXT:    vand.vv v16, v10, v12
+; RV32ZVBC32-NEXT:    vand.vx v10, v10, a0
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v10, v2, v10
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v26
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v30
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v6
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v22
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 352
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv2i64_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vclmul.vv v8, v8, v10
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmul_nxv2i64_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v12, v10, 2
+; RV64ZVBC32-NEXT:    vand.vi v14, v10, 1
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV64ZVBC32-NEXT:    vand.vi v14, v10, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vi v14, v10, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    li a1, 32
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    li a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    li a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    li a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a2
+; RV64ZVBC32-NEXT:    slli a1, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 1
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 2
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 16
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 32
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 64
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 256
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    lui a1, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 31
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 33
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 34
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 35
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 36
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 37
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 38
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 39
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 40
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 41
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 42
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 43
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 44
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 45
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 46
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 47
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 48
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 49
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 50
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 51
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 52
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 53
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 54
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 55
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 56
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 57
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 58
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 59
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 60
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 61
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a1
+; RV64ZVBC32-NEXT:    li a1, -1
+; RV64ZVBC32-NEXT:    slli a0, a0, 62
+; RV64ZVBC32-NEXT:    slli a1, a1, 63
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a1
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v10
+; RV64ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.clmul.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb)
   ret <vscale x 2 x i64> %v
 }
@@ -9110,23 +15425,1099 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vx(<vscale x 2 x i64> %va, i64 %b) noun
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv2i64_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    addi sp, sp, -16
-; RV32ZVBC-NEXT:    sw a0, 8(sp)
-; RV32ZVBC-NEXT:    sw a1, 12(sp)
-; RV32ZVBC-NEXT:    addi a0, sp, 8
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vlse64.v v10, (a0), zero
-; RV32ZVBC-NEXT:    vclmul.vv v8, v8, v10
-; RV32ZVBC-NEXT:    addi sp, sp, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv2i64_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    addi sp, sp, -16
+; RV32ZVBC64-NEXT:    sw a0, 8(sp)
+; RV32ZVBC64-NEXT:    sw a1, 12(sp)
+; RV32ZVBC64-NEXT:    addi a0, sp, 8
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vlse64.v v10, (a0), zero
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v8, v10
+; RV32ZVBC64-NEXT:    addi sp, sp, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv2i64_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vclmul.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv2i64_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -352
+; RV32ZVBC32-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a2, 2
+; RV32ZVBC32-NEXT:    mv a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    add a3, a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 2
+; RV32ZVBC32-NEXT:    add a2, a2, a3
+; RV32ZVBC32-NEXT:    sub sp, sp, a2
+; RV32ZVBC32-NEXT:    sw a0, 8(sp)
+; RV32ZVBC32-NEXT:    sw a1, 12(sp)
+; RV32ZVBC32-NEXT:    addi a0, sp, 8
+; RV32ZVBC32-NEXT:    lui s5, 524288
+; RV32ZVBC32-NEXT:    li s11, 1
+; RV32ZVBC32-NEXT:    li t2, 2
+; RV32ZVBC32-NEXT:    li s6, 4
+; RV32ZVBC32-NEXT:    li ra, 8
+; RV32ZVBC32-NEXT:    li s10, 128
+; RV32ZVBC32-NEXT:    li s9, 256
+; RV32ZVBC32-NEXT:    li s8, 512
+; RV32ZVBC32-NEXT:    li s7, 1024
+; RV32ZVBC32-NEXT:    lui s4, 1
+; RV32ZVBC32-NEXT:    lui s3, 2
+; RV32ZVBC32-NEXT:    lui s2, 4
+; RV32ZVBC32-NEXT:    lui s1, 8
+; RV32ZVBC32-NEXT:    lui s0, 16
+; RV32ZVBC32-NEXT:    lui t6, 32
+; RV32ZVBC32-NEXT:    lui t5, 64
+; RV32ZVBC32-NEXT:    lui t4, 128
+; RV32ZVBC32-NEXT:    lui t3, 256
+; RV32ZVBC32-NEXT:    lui t0, 512
+; RV32ZVBC32-NEXT:    lui a7, 1024
+; RV32ZVBC32-NEXT:    lui a6, 2048
+; RV32ZVBC32-NEXT:    lui a5, 4096
+; RV32ZVBC32-NEXT:    lui a4, 8192
+; RV32ZVBC32-NEXT:    lui a3, 16384
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32ZVBC32-NEXT:    vlse64.v v10, (a0), zero
+; RV32ZVBC32-NEXT:    lui a2, 32768
+; RV32ZVBC32-NEXT:    sw s5, 16(sp)
+; RV32ZVBC32-NEXT:    lui t1, 524288
+; RV32ZVBC32-NEXT:    sw zero, 20(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw s11, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw t2, 268(sp)
+; RV32ZVBC32-NEXT:    lui a0, 65536
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw s6, 260(sp)
+; RV32ZVBC32-NEXT:    lui t2, 131072
+; RV32ZVBC32-NEXT:    sw zero, 248(sp)
+; RV32ZVBC32-NEXT:    sw ra, 252(sp)
+; RV32ZVBC32-NEXT:    lui a1, 262144
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    li s6, 16
+; RV32ZVBC32-NEXT:    sw s6, 244(sp)
+; RV32ZVBC32-NEXT:    li s6, 16
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    li s5, 32
+; RV32ZVBC32-NEXT:    sw s5, 236(sp)
+; RV32ZVBC32-NEXT:    li s5, 32
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    li ra, 64
+; RV32ZVBC32-NEXT:    sw ra, 228(sp)
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s10, 220(sp)
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s9, 212(sp)
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s8, 204(sp)
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw s7, 196(sp)
+; RV32ZVBC32-NEXT:    slli s11, s11, 11
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw s11, 188(sp)
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw s4, 180(sp)
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw s3, 172(sp)
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw s2, 164(sp)
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw s1, 156(sp)
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw s0, 148(sp)
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw t6, 140(sp)
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw t5, 132(sp)
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw t4, 124(sp)
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw t3, 116(sp)
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw t0, 108(sp)
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw a7, 100(sp)
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw a6, 92(sp)
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw a5, 84(sp)
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw a4, 76(sp)
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw a3, 68(sp)
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw a2, 60(sp)
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw a0, 52(sp)
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw t2, 44(sp)
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    sw a1, 36(sp)
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    sw t1, 28(sp)
+; RV32ZVBC32-NEXT:    addi a1, sp, 16
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 272
+; RV32ZVBC32-NEXT:    vlse64.v v22, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 264
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 256
+; RV32ZVBC32-NEXT:    vlse64.v v2, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 248
+; RV32ZVBC32-NEXT:    vlse64.v v26, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 240
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 232
+; RV32ZVBC32-NEXT:    vlse64.v v30, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 224
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    mv a5, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 2
+; RV32ZVBC32-NEXT:    add a5, a5, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a5
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs2r.v v16, (a1) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a1, sp, 216
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 3
+; RV32ZVBC32-NEXT:    mv a5, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a5
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs2r.v v16, (a1) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a1, sp, 208
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    mv a5, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a5, a5, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a5
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs2r.v v16, (a1) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a1, sp, 200
+; RV32ZVBC32-NEXT:    vlse64.v v6, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 192
+; RV32ZVBC32-NEXT:    vlse64.v v4, (a1), zero
+; RV32ZVBC32-NEXT:    addi ra, sp, 184
+; RV32ZVBC32-NEXT:    vand.vi v16, v10, 2
+; RV32ZVBC32-NEXT:    vand.vi v18, v10, 1
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v18, v16
+; RV32ZVBC32-NEXT:    vand.vi v18, v10, 4
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    vand.vi v18, v10, 8
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, s6
+; RV32ZVBC32-NEXT:    addi s10, sp, 176
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, s5
+; RV32ZVBC32-NEXT:    addi s9, sp, 168
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    li a1, 64
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a1
+; RV32ZVBC32-NEXT:    addi s8, sp, 160
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    li a1, 128
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a1
+; RV32ZVBC32-NEXT:    addi s7, sp, 152
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    li a1, 256
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a1
+; RV32ZVBC32-NEXT:    addi s6, sp, 144
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    li a1, 512
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a1
+; RV32ZVBC32-NEXT:    addi s5, sp, 136
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    li a1, 1024
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a1
+; RV32ZVBC32-NEXT:    addi s4, sp, 128
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, s11
+; RV32ZVBC32-NEXT:    addi s11, sp, 120
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    lui a1, 1
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a1
+; RV32ZVBC32-NEXT:    addi s3, sp, 112
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    lui a1, 2
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a1
+; RV32ZVBC32-NEXT:    addi s2, sp, 104
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    lui a1, 4
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a1
+; RV32ZVBC32-NEXT:    addi s1, sp, 96
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    lui a1, 8
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a1
+; RV32ZVBC32-NEXT:    addi s0, sp, 88
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    lui a1, 16
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a1
+; RV32ZVBC32-NEXT:    addi t6, sp, 80
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    lui a1, 32
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a1
+; RV32ZVBC32-NEXT:    addi t5, sp, 72
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    lui a1, 64
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a1
+; RV32ZVBC32-NEXT:    addi t4, sp, 64
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    lui a1, 128
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a1
+; RV32ZVBC32-NEXT:    addi t3, sp, 56
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    lui a1, 256
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a1
+; RV32ZVBC32-NEXT:    addi t2, sp, 48
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, t0
+; RV32ZVBC32-NEXT:    addi t1, sp, 40
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a7
+; RV32ZVBC32-NEXT:    addi a7, sp, 32
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a6
+; RV32ZVBC32-NEXT:    addi a5, sp, 24
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    lui a1, 4096
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a1
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a4
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a3
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a2
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a0
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    lui a0, 131072
+; RV32ZVBC32-NEXT:    vand.vx v18, v10, a0
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v20, v16, v18
+; RV32ZVBC32-NEXT:    vlse64.v v0, (ra), zero
+; RV32ZVBC32-NEXT:    vand.vv v16, v10, v12
+; RV32ZVBC32-NEXT:    vand.vv v18, v10, v22
+; RV32ZVBC32-NEXT:    vand.vv v22, v10, v14
+; RV32ZVBC32-NEXT:    vand.vv v24, v10, v2
+; RV32ZVBC32-NEXT:    vand.vv v26, v10, v26
+; RV32ZVBC32-NEXT:    vand.vv v28, v10, v28
+; RV32ZVBC32-NEXT:    vand.vv v30, v10, v30
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v2, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v6, (s10), zero
+; RV32ZVBC32-NEXT:    vlse64.v v4, (s9), zero
+; RV32ZVBC32-NEXT:    vlse64.v v12, (s8), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (s7), zero
+; RV32ZVBC32-NEXT:    vand.vv v0, v10, v6
+; RV32ZVBC32-NEXT:    vand.vv v6, v10, v4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v6, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v12, (s6), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (s5), zero
+; RV32ZVBC32-NEXT:    vlse64.v v6, (s4), zero
+; RV32ZVBC32-NEXT:    vlse64.v v4, (s11), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v12, (s3), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (s2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v6, (s1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v4, (s0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v12, (t6), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (t5), zero
+; RV32ZVBC32-NEXT:    vlse64.v v4, (t4), zero
+; RV32ZVBC32-NEXT:    vlse64.v v6, (t3), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v14, (t2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v6, (t1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v4, (a5), zero
+; RV32ZVBC32-NEXT:    vand.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vand.vv v6, v10, v6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v6, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v6, v10, v4
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    vand.vx v10, v10, a0
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v10, v20, v10
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v22
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v26
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v30
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v6
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 352
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv2i64_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vclmul.vx v8, v8, a0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmul_nxv2i64_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    andi a1, a0, 2
+; RV64ZVBC32-NEXT:    andi a2, a0, 1
+; RV64ZVBC32-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 4
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 8
+; RV64ZVBC32-NEXT:    vxor.vv v10, v12, v10
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 16
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 32
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 64
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 128
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 256
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 512
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a1
+; RV64ZVBC32-NEXT:    andi a3, a0, 1024
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    li a1, 1
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a3
+; RV64ZVBC32-NEXT:    slli a2, a1, 11
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 64
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 128
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 256
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 512
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1024
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2048
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4096
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8192
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16384
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32768
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 65536
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 131072
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 262144
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    srliw a2, a0, 31
+; RV64ZVBC32-NEXT:    slli a2, a2, 31
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 32
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 33
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 34
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 35
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 36
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 37
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 38
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 39
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 40
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 41
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 42
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 43
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 44
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 45
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 46
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 47
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 48
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 49
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 50
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 51
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 52
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 53
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 54
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 55
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 56
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 57
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 58
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 59
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 60
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 61
+; RV64ZVBC32-NEXT:    slli a1, a1, 62
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    and a1, a0, a1
+; RV64ZVBC32-NEXT:    srli a0, a0, 63
+; RV64ZVBC32-NEXT:    slli a0, a0, 63
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a1
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
   %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
   %v = call <vscale x 2 x i64> @llvm.clmul.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb)
@@ -10379,17 +17770,1262 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4
 ; RV64V-NEXT:    vxor.vv v8, v16, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv4i64_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vclmul.vv v8, v8, v12
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv4i64_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v8, v12
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv4i64_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vclmul.vv v8, v8, v12
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmul_nxv4i64_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v8, v12
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv4i64_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -352
+; RV32ZVBC32-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    sub sp, sp, a0
+; RV32ZVBC32-NEXT:    lui a1, 524288
+; RV32ZVBC32-NEXT:    li s5, 1
+; RV32ZVBC32-NEXT:    li a3, 2
+; RV32ZVBC32-NEXT:    li a2, 4
+; RV32ZVBC32-NEXT:    li a0, 8
+; RV32ZVBC32-NEXT:    li s3, 16
+; RV32ZVBC32-NEXT:    li s2, 32
+; RV32ZVBC32-NEXT:    li s4, 64
+; RV32ZVBC32-NEXT:    li s6, 128
+; RV32ZVBC32-NEXT:    li s8, 256
+; RV32ZVBC32-NEXT:    li s0, 512
+; RV32ZVBC32-NEXT:    li s7, 1024
+; RV32ZVBC32-NEXT:    lui ra, 1
+; RV32ZVBC32-NEXT:    lui s11, 2
+; RV32ZVBC32-NEXT:    lui s10, 4
+; RV32ZVBC32-NEXT:    lui s9, 8
+; RV32ZVBC32-NEXT:    lui s1, 16
+; RV32ZVBC32-NEXT:    lui t6, 32
+; RV32ZVBC32-NEXT:    lui t5, 64
+; RV32ZVBC32-NEXT:    lui t4, 128
+; RV32ZVBC32-NEXT:    lui t3, 256
+; RV32ZVBC32-NEXT:    lui t2, 512
+; RV32ZVBC32-NEXT:    lui t1, 1024
+; RV32ZVBC32-NEXT:    lui t0, 2048
+; RV32ZVBC32-NEXT:    lui a7, 4096
+; RV32ZVBC32-NEXT:    lui a6, 8192
+; RV32ZVBC32-NEXT:    lui a5, 16384
+; RV32ZVBC32-NEXT:    lui a4, 32768
+; RV32ZVBC32-NEXT:    sw a1, 16(sp)
+; RV32ZVBC32-NEXT:    sw zero, 20(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw s5, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw a3, 268(sp)
+; RV32ZVBC32-NEXT:    lui a3, 65536
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw a2, 260(sp)
+; RV32ZVBC32-NEXT:    lui a2, 131072
+; RV32ZVBC32-NEXT:    sw zero, 248(sp)
+; RV32ZVBC32-NEXT:    sw a0, 252(sp)
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v28, v12, 2
+; RV32ZVBC32-NEXT:    vand.vi v4, v12, 1
+; RV32ZVBC32-NEXT:    vand.vi v24, v12, 4
+; RV32ZVBC32-NEXT:    vand.vi v20, v12, 8
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    sw s3, 244(sp)
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, s3
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s3, sp, 16
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    sw s2, 236(sp)
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, s2
+; RV32ZVBC32-NEXT:    addi s2, sp, 272
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw s4, 228(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v28
+; RV32ZVBC32-NEXT:    vmul.vv v28, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v28, v28, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, s4
+; RV32ZVBC32-NEXT:    addi s4, sp, 264
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s6, 220(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v28, v28, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v12, s6
+; RV32ZVBC32-NEXT:    addi s6, sp, 256
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s8, 212(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v20, v28, v20
+; RV32ZVBC32-NEXT:    vand.vx v28, v12, s8
+; RV32ZVBC32-NEXT:    addi s8, sp, 248
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s0, 204(sp)
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v4, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v4, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v4
+; RV32ZVBC32-NEXT:    vand.vx v4, v12, s0
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw s7, 196(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, s7
+; RV32ZVBC32-NEXT:    slli a0, s5, 11
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, ra
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw a0, 188(sp)
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw ra, 180(sp)
+; RV32ZVBC32-NEXT:    addi s5, sp, 224
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v24, v20, v24
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, s11
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw s11, 172(sp)
+; RV32ZVBC32-NEXT:    addi s11, sp, 216
+; RV32ZVBC32-NEXT:    vmul.vv v28, v8, v28
+; RV32ZVBC32-NEXT:    vxor.vv v28, v24, v28
+; RV32ZVBC32-NEXT:    vand.vx v24, v12, s10
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw s10, 164(sp)
+; RV32ZVBC32-NEXT:    addi s10, sp, 208
+; RV32ZVBC32-NEXT:    vmul.vv v4, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v4, v28, v4
+; RV32ZVBC32-NEXT:    vand.vx v28, v12, s9
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw s9, 156(sp)
+; RV32ZVBC32-NEXT:    addi s9, sp, 200
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi ra, sp, 192
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v0, v4, v0
+; RV32ZVBC32-NEXT:    vand.vx v4, v12, s1
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw s1, 148(sp)
+; RV32ZVBC32-NEXT:    addi s1, sp, 184
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v0, v0, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, t6
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw t6, 140(sp)
+; RV32ZVBC32-NEXT:    addi s0, sp, 176
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v0, v0, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, t5
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw t5, 132(sp)
+; RV32ZVBC32-NEXT:    addi t6, sp, 168
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v0, v0, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v12, t4
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw t4, 124(sp)
+; RV32ZVBC32-NEXT:    addi t5, sp, 160
+; RV32ZVBC32-NEXT:    vmul.vv v28, v8, v28
+; RV32ZVBC32-NEXT:    vxor.vv v0, v0, v28
+; RV32ZVBC32-NEXT:    vand.vx v28, v12, t3
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw t3, 116(sp)
+; RV32ZVBC32-NEXT:    addi t4, sp, 152
+; RV32ZVBC32-NEXT:    vmul.vv v4, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v0, v0, v4
+; RV32ZVBC32-NEXT:    vand.vx v4, v12, t2
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw t2, 108(sp)
+; RV32ZVBC32-NEXT:    addi t3, sp, 144
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v16, v0, v16
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, t1
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw t1, 100(sp)
+; RV32ZVBC32-NEXT:    addi t2, sp, 136
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v20, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, t0
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw t0, 92(sp)
+; RV32ZVBC32-NEXT:    addi t1, sp, 128
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v24, v20, v24
+; RV32ZVBC32-NEXT:    vand.vx v20, v12, a7
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw a7, 84(sp)
+; RV32ZVBC32-NEXT:    addi t0, sp, 120
+; RV32ZVBC32-NEXT:    vmul.vv v28, v8, v28
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v28
+; RV32ZVBC32-NEXT:    vand.vx v28, v12, a6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v28, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw a6, 76(sp)
+; RV32ZVBC32-NEXT:    addi a7, sp, 112
+; RV32ZVBC32-NEXT:    vmul.vv v28, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v28
+; RV32ZVBC32-NEXT:    vand.vx v28, v12, a5
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v28, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw a5, 68(sp)
+; RV32ZVBC32-NEXT:    addi a6, sp, 104
+; RV32ZVBC32-NEXT:    vmul.vv v28, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v28, v24, v28
+; RV32ZVBC32-NEXT:    vand.vx v24, v12, a4
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw a4, 60(sp)
+; RV32ZVBC32-NEXT:    addi a5, sp, 96
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v16, v28, v16
+; RV32ZVBC32-NEXT:    vand.vx v28, v12, a3
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw a3, 52(sp)
+; RV32ZVBC32-NEXT:    addi a4, sp, 88
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v4, v12, a2
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw a2, 44(sp)
+; RV32ZVBC32-NEXT:    addi a3, sp, 80
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    lui a1, 262144
+; RV32ZVBC32-NEXT:    sw a1, 36(sp)
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    sw a0, 28(sp)
+; RV32ZVBC32-NEXT:    addi a2, sp, 72
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add s7, s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add s7, s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, s7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v20, v16, v20
+; RV32ZVBC32-NEXT:    vlse64.v v16, (s3), zero
+; RV32ZVBC32-NEXT:    addi s3, sp, 64
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add s7, s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, s7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v0, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v0, v20, v0
+; RV32ZVBC32-NEXT:    vlse64.v v20, (s2), zero
+; RV32ZVBC32-NEXT:    addi s2, sp, 56
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v0, v0, v24
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s4), zero
+; RV32ZVBC32-NEXT:    addi s4, sp, 48
+; RV32ZVBC32-NEXT:    vmul.vv v28, v8, v28
+; RV32ZVBC32-NEXT:    vxor.vv v0, v0, v28
+; RV32ZVBC32-NEXT:    vlse64.v v28, (s6), zero
+; RV32ZVBC32-NEXT:    addi s6, sp, 40
+; RV32ZVBC32-NEXT:    vmul.vv v4, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v4, v0, v4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add s7, s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add s7, s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, s7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v4, (s8), zero
+; RV32ZVBC32-NEXT:    addi s8, sp, 32
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, s7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add s7, s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, s7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add s7, s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, s7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add s7, s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add s7, s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, s7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add s7, s7, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, s7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 24
+; RV32ZVBC32-NEXT:    addi s7, sp, 240
+; RV32ZVBC32-NEXT:    vlse64.v v16, (s7), zero
+; RV32ZVBC32-NEXT:    addi s7, sp, 232
+; RV32ZVBC32-NEXT:    vlse64.v v20, (s7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s5), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (s11), zero
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v16
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 4
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s5) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v20
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 2
+; RV32ZVBC32-NEXT:    mv s7, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s7, s7, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 2
+; RV32ZVBC32-NEXT:    add s5, s5, s7
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s5) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v24
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 2
+; RV32ZVBC32-NEXT:    mv s7, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 4
+; RV32ZVBC32-NEXT:    add s5, s5, s7
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s5) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v28
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 2
+; RV32ZVBC32-NEXT:    mv s7, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s7, s7, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s7, s7, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 2
+; RV32ZVBC32-NEXT:    add s5, s5, s7
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s5) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v20, (s10), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s9), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (ra), zero
+; RV32ZVBC32-NEXT:    vlse64.v v4, (s1), zero
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v20
+; RV32ZVBC32-NEXT:    csrr s1, vlenb
+; RV32ZVBC32-NEXT:    slli s1, s1, 2
+; RV32ZVBC32-NEXT:    mv s5, s1
+; RV32ZVBC32-NEXT:    slli s1, s1, 1
+; RV32ZVBC32-NEXT:    add s1, s1, s5
+; RV32ZVBC32-NEXT:    add s1, sp, s1
+; RV32ZVBC32-NEXT:    addi s1, s1, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s1) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v24
+; RV32ZVBC32-NEXT:    csrr s1, vlenb
+; RV32ZVBC32-NEXT:    slli s1, s1, 3
+; RV32ZVBC32-NEXT:    mv s5, s1
+; RV32ZVBC32-NEXT:    slli s1, s1, 2
+; RV32ZVBC32-NEXT:    add s1, s1, s5
+; RV32ZVBC32-NEXT:    add s1, sp, s1
+; RV32ZVBC32-NEXT:    addi s1, s1, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s1) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v28
+; RV32ZVBC32-NEXT:    csrr s1, vlenb
+; RV32ZVBC32-NEXT:    slli s1, s1, 6
+; RV32ZVBC32-NEXT:    add s1, sp, s1
+; RV32ZVBC32-NEXT:    addi s1, s1, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s1) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v4
+; RV32ZVBC32-NEXT:    csrr s1, vlenb
+; RV32ZVBC32-NEXT:    slli s1, s1, 3
+; RV32ZVBC32-NEXT:    mv s5, s1
+; RV32ZVBC32-NEXT:    slli s1, s1, 1
+; RV32ZVBC32-NEXT:    add s5, s5, s1
+; RV32ZVBC32-NEXT:    slli s1, s1, 2
+; RV32ZVBC32-NEXT:    add s1, s1, s5
+; RV32ZVBC32-NEXT:    add s1, sp, s1
+; RV32ZVBC32-NEXT:    addi s1, s1, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s1) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t6), zero
+; RV32ZVBC32-NEXT:    vlse64.v v4, (t5), zero
+; RV32ZVBC32-NEXT:    vlse64.v v0, (t4), zero
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v24
+; RV32ZVBC32-NEXT:    csrr t4, vlenb
+; RV32ZVBC32-NEXT:    slli t4, t4, 3
+; RV32ZVBC32-NEXT:    add t4, sp, t4
+; RV32ZVBC32-NEXT:    addi t4, t4, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (t4) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v28
+; RV32ZVBC32-NEXT:    csrr t4, vlenb
+; RV32ZVBC32-NEXT:    slli t4, t4, 2
+; RV32ZVBC32-NEXT:    mv t5, t4
+; RV32ZVBC32-NEXT:    slli t4, t4, 3
+; RV32ZVBC32-NEXT:    add t4, t4, t5
+; RV32ZVBC32-NEXT:    add t4, sp, t4
+; RV32ZVBC32-NEXT:    addi t4, t4, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (t4) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v4
+; RV32ZVBC32-NEXT:    csrr t4, vlenb
+; RV32ZVBC32-NEXT:    slli t4, t4, 2
+; RV32ZVBC32-NEXT:    mv t5, t4
+; RV32ZVBC32-NEXT:    slli t4, t4, 1
+; RV32ZVBC32-NEXT:    add t5, t5, t4
+; RV32ZVBC32-NEXT:    slli t4, t4, 1
+; RV32ZVBC32-NEXT:    add t5, t5, t4
+; RV32ZVBC32-NEXT:    slli t4, t4, 1
+; RV32ZVBC32-NEXT:    add t4, t4, t5
+; RV32ZVBC32-NEXT:    add t4, sp, t4
+; RV32ZVBC32-NEXT:    addi t4, t4, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (t4) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v0
+; RV32ZVBC32-NEXT:    csrr t4, vlenb
+; RV32ZVBC32-NEXT:    slli t4, t4, 2
+; RV32ZVBC32-NEXT:    mv t5, t4
+; RV32ZVBC32-NEXT:    slli t4, t4, 2
+; RV32ZVBC32-NEXT:    add t5, t5, t4
+; RV32ZVBC32-NEXT:    slli t4, t4, 2
+; RV32ZVBC32-NEXT:    add t4, t4, t5
+; RV32ZVBC32-NEXT:    add t4, sp, t4
+; RV32ZVBC32-NEXT:    addi t4, t4, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (t4) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t3), zero
+; RV32ZVBC32-NEXT:    vlse64.v v4, (t2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v0, (t1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (t0), zero
+; RV32ZVBC32-NEXT:    vand.vv v20, v12, v28
+; RV32ZVBC32-NEXT:    csrr t0, vlenb
+; RV32ZVBC32-NEXT:    slli t0, t0, 2
+; RV32ZVBC32-NEXT:    add t0, sp, t0
+; RV32ZVBC32-NEXT:    addi t0, t0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v20, (t0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v20, v12, v4
+; RV32ZVBC32-NEXT:    csrr t0, vlenb
+; RV32ZVBC32-NEXT:    slli t0, t0, 5
+; RV32ZVBC32-NEXT:    add t0, sp, t0
+; RV32ZVBC32-NEXT:    addi t0, t0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v20, (t0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v20, v12, v0
+; RV32ZVBC32-NEXT:    csrr t0, vlenb
+; RV32ZVBC32-NEXT:    slli t0, t0, 3
+; RV32ZVBC32-NEXT:    mv t1, t0
+; RV32ZVBC32-NEXT:    slli t0, t0, 1
+; RV32ZVBC32-NEXT:    add t1, t1, t0
+; RV32ZVBC32-NEXT:    slli t0, t0, 1
+; RV32ZVBC32-NEXT:    add t0, t0, t1
+; RV32ZVBC32-NEXT:    add t0, sp, t0
+; RV32ZVBC32-NEXT:    addi t0, t0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v20, (t0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v16
+; RV32ZVBC32-NEXT:    csrr t0, vlenb
+; RV32ZVBC32-NEXT:    slli t0, t0, 4
+; RV32ZVBC32-NEXT:    mv t1, t0
+; RV32ZVBC32-NEXT:    slli t0, t0, 2
+; RV32ZVBC32-NEXT:    add t0, t0, t1
+; RV32ZVBC32-NEXT:    add t0, sp, t0
+; RV32ZVBC32-NEXT:    addi t0, t0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (t0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a6), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a5), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a4), zero
+; RV32ZVBC32-NEXT:    vand.vv v4, v12, v16
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v0
+; RV32ZVBC32-NEXT:    csrr a4, vlenb
+; RV32ZVBC32-NEXT:    slli a4, a4, 2
+; RV32ZVBC32-NEXT:    mv a5, a4
+; RV32ZVBC32-NEXT:    slli a4, a4, 1
+; RV32ZVBC32-NEXT:    add a5, a5, a4
+; RV32ZVBC32-NEXT:    slli a4, a4, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a5
+; RV32ZVBC32-NEXT:    add a4, sp, a4
+; RV32ZVBC32-NEXT:    addi a4, a4, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a4) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v20
+; RV32ZVBC32-NEXT:    csrr a4, vlenb
+; RV32ZVBC32-NEXT:    slli a4, a4, 2
+; RV32ZVBC32-NEXT:    mv a5, a4
+; RV32ZVBC32-NEXT:    slli a4, a4, 2
+; RV32ZVBC32-NEXT:    add a5, a5, a4
+; RV32ZVBC32-NEXT:    slli a4, a4, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a5
+; RV32ZVBC32-NEXT:    add a4, sp, a4
+; RV32ZVBC32-NEXT:    addi a4, a4, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a4) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v24
+; RV32ZVBC32-NEXT:    csrr a4, vlenb
+; RV32ZVBC32-NEXT:    slli a4, a4, 2
+; RV32ZVBC32-NEXT:    mv a5, a4
+; RV32ZVBC32-NEXT:    slli a4, a4, 1
+; RV32ZVBC32-NEXT:    add a5, a5, a4
+; RV32ZVBC32-NEXT:    slli a4, a4, 3
+; RV32ZVBC32-NEXT:    add a4, a4, a5
+; RV32ZVBC32-NEXT:    add a4, sp, a4
+; RV32ZVBC32-NEXT:    addi a4, a4, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a4) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a3), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s3), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (s2), zero
+; RV32ZVBC32-NEXT:    vand.vv v0, v12, v16
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v20
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a2, 3
+; RV32ZVBC32-NEXT:    mv a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a3
+; RV32ZVBC32-NEXT:    add a2, sp, a2
+; RV32ZVBC32-NEXT:    addi a2, a2, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a2) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v24
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a2, 4
+; RV32ZVBC32-NEXT:    mv a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a3
+; RV32ZVBC32-NEXT:    add a2, sp, a2
+; RV32ZVBC32-NEXT:    addi a2, a2, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a2) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v28
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a2, 3
+; RV32ZVBC32-NEXT:    mv a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 3
+; RV32ZVBC32-NEXT:    add a2, a2, a3
+; RV32ZVBC32-NEXT:    add a2, sp, a2
+; RV32ZVBC32-NEXT:    addi a2, a2, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a2) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v16, (s4), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (s6), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s8), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a0), zero
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v16
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a2, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vand.vv v20, v12, v28
+; RV32ZVBC32-NEXT:    vand.vx v12, v12, a1
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v16, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 352
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv4i64_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v16, v12, 2
+; RV64ZVBC32-NEXT:    vand.vi v20, v12, 1
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v20, v16
+; RV64ZVBC32-NEXT:    vand.vi v20, v12, 4
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vi v20, v12, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    li a1, 32
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    li a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    li a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    li a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a2
+; RV64ZVBC32-NEXT:    slli a1, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 1
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 2
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 4
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 8
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 16
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 32
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 64
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 256
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    lui a1, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 31
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 33
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 34
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 35
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 36
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 37
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 38
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 39
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 40
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 41
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 42
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 43
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 44
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 45
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 46
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 47
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 48
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 49
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 50
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 51
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 52
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 53
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 54
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 55
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 56
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 57
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 58
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 59
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 60
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 61
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a1
+; RV64ZVBC32-NEXT:    li a1, -1
+; RV64ZVBC32-NEXT:    slli a0, a0, 62
+; RV64ZVBC32-NEXT:    slli a1, a1, 63
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a1
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v12
+; RV64ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 4 x i64> @llvm.clmul.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb)
   ret <vscale x 4 x i64> %v
 }
@@ -11592,23 +20228,1220 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vx(<vscale x 4 x i64> %va, i64 %b) noun
 ; RV64V-NEXT:    vxor.vv v8, v12, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv4i64_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    addi sp, sp, -16
-; RV32ZVBC-NEXT:    sw a0, 8(sp)
-; RV32ZVBC-NEXT:    sw a1, 12(sp)
-; RV32ZVBC-NEXT:    addi a0, sp, 8
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vlse64.v v12, (a0), zero
-; RV32ZVBC-NEXT:    vclmul.vv v8, v8, v12
-; RV32ZVBC-NEXT:    addi sp, sp, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv4i64_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    addi sp, sp, -16
+; RV32ZVBC64-NEXT:    sw a0, 8(sp)
+; RV32ZVBC64-NEXT:    sw a1, 12(sp)
+; RV32ZVBC64-NEXT:    addi a0, sp, 8
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v8, v12
+; RV32ZVBC64-NEXT:    addi sp, sp, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv4i64_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vclmul.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv4i64_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -352
+; RV32ZVBC32-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a2, 3
+; RV32ZVBC32-NEXT:    mv a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    add a3, a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    add a3, a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a3
+; RV32ZVBC32-NEXT:    sub sp, sp, a2
+; RV32ZVBC32-NEXT:    sw a0, 8(sp)
+; RV32ZVBC32-NEXT:    sw a1, 12(sp)
+; RV32ZVBC32-NEXT:    addi a1, sp, 8
+; RV32ZVBC32-NEXT:    lui a2, 524288
+; RV32ZVBC32-NEXT:    li s11, 1
+; RV32ZVBC32-NEXT:    li a0, 2
+; RV32ZVBC32-NEXT:    li s9, 4
+; RV32ZVBC32-NEXT:    li s10, 8
+; RV32ZVBC32-NEXT:    li ra, 64
+; RV32ZVBC32-NEXT:    li s8, 256
+; RV32ZVBC32-NEXT:    li s7, 512
+; RV32ZVBC32-NEXT:    li s6, 1024
+; RV32ZVBC32-NEXT:    lui s5, 1
+; RV32ZVBC32-NEXT:    lui s4, 2
+; RV32ZVBC32-NEXT:    lui s3, 4
+; RV32ZVBC32-NEXT:    lui s2, 8
+; RV32ZVBC32-NEXT:    lui s1, 16
+; RV32ZVBC32-NEXT:    lui s0, 32
+; RV32ZVBC32-NEXT:    lui t6, 64
+; RV32ZVBC32-NEXT:    lui t5, 128
+; RV32ZVBC32-NEXT:    lui t4, 256
+; RV32ZVBC32-NEXT:    lui t2, 512
+; RV32ZVBC32-NEXT:    lui t1, 1024
+; RV32ZVBC32-NEXT:    lui t0, 2048
+; RV32ZVBC32-NEXT:    lui a7, 4096
+; RV32ZVBC32-NEXT:    lui a6, 8192
+; RV32ZVBC32-NEXT:    lui a5, 16384
+; RV32ZVBC32-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a1), zero
+; RV32ZVBC32-NEXT:    lui a3, 32768
+; RV32ZVBC32-NEXT:    sw a2, 16(sp)
+; RV32ZVBC32-NEXT:    lui t3, 524288
+; RV32ZVBC32-NEXT:    sw zero, 20(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw s11, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw a0, 268(sp)
+; RV32ZVBC32-NEXT:    lui a2, 65536
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw s9, 260(sp)
+; RV32ZVBC32-NEXT:    lui a1, 131072
+; RV32ZVBC32-NEXT:    sw zero, 248(sp)
+; RV32ZVBC32-NEXT:    sw s10, 252(sp)
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    li a4, 16
+; RV32ZVBC32-NEXT:    sw a4, 244(sp)
+; RV32ZVBC32-NEXT:    li s10, 16
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    li a4, 32
+; RV32ZVBC32-NEXT:    sw a4, 236(sp)
+; RV32ZVBC32-NEXT:    li a4, 32
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw ra, 228(sp)
+; RV32ZVBC32-NEXT:    li ra, 64
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    li s9, 128
+; RV32ZVBC32-NEXT:    sw s9, 220(sp)
+; RV32ZVBC32-NEXT:    li s9, 128
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s8, 212(sp)
+; RV32ZVBC32-NEXT:    li s8, 256
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s7, 204(sp)
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw s6, 196(sp)
+; RV32ZVBC32-NEXT:    slli s11, s11, 11
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw s11, 188(sp)
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw s5, 180(sp)
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw s4, 172(sp)
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw s3, 164(sp)
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw s2, 156(sp)
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw s1, 148(sp)
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw s0, 140(sp)
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw t6, 132(sp)
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw t5, 124(sp)
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw t4, 116(sp)
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw t2, 108(sp)
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw t1, 100(sp)
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw t0, 92(sp)
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw a7, 84(sp)
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw a6, 76(sp)
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw a5, 68(sp)
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw a3, 60(sp)
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw a2, 52(sp)
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw a1, 44(sp)
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    sw a0, 36(sp)
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    sw t3, 28(sp)
+; RV32ZVBC32-NEXT:    addi a0, sp, 16
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 272
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 264
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 256
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    vand.vi v4, v12, 2
+; RV32ZVBC32-NEXT:    vand.vi v0, v12, 1
+; RV32ZVBC32-NEXT:    vmul.vv v4, v8, v4
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v0, v4
+; RV32ZVBC32-NEXT:    vand.vi v0, v12, 4
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    vand.vi v0, v12, 8
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, s10
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a4
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, ra
+; RV32ZVBC32-NEXT:    addi s10, sp, 224
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, s9
+; RV32ZVBC32-NEXT:    addi s7, sp, 216
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, s8
+; RV32ZVBC32-NEXT:    addi s6, sp, 208
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi s5, sp, 200
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi s4, sp, 192
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, s11
+; RV32ZVBC32-NEXT:    addi s11, sp, 184
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi s3, sp, 176
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi s2, sp, 168
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi s1, sp, 160
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi s0, sp, 152
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 16
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi t6, sp, 144
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 32
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi t5, sp, 136
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 64
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi t4, sp, 128
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi t3, sp, 120
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 256
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi t2, sp, 112
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 512
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi t1, sp, 104
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 1024
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi t0, sp, 96
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 2048
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi a7, sp, 88
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 4096
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi a6, sp, 80
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 8192
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi a5, sp, 72
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 16384
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi a4, sp, 64
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a3
+; RV32ZVBC32-NEXT:    addi a2, sp, 56
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    lui a0, 65536
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a0
+; RV32ZVBC32-NEXT:    addi a3, sp, 48
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v12, a1
+; RV32ZVBC32-NEXT:    addi a1, sp, 40
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv s8, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add s8, s8, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add s8, s8, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, s8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 248
+; RV32ZVBC32-NEXT:    vlse64.v v4, (a0), zero
+; RV32ZVBC32-NEXT:    addi ra, sp, 32
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv s8, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, s8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv s8, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add s8, s8, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, s8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv s8, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add s8, s8, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, s8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv s8, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add s8, s8, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add s8, s8, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, s8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv s8, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add s8, s8, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, s8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 24
+; RV32ZVBC32-NEXT:    addi s8, sp, 240
+; RV32ZVBC32-NEXT:    vlse64.v v16, (s8), zero
+; RV32ZVBC32-NEXT:    addi s8, sp, 232
+; RV32ZVBC32-NEXT:    vlse64.v v20, (s8), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s10), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (s7), zero
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v16
+; RV32ZVBC32-NEXT:    csrr s7, vlenb
+; RV32ZVBC32-NEXT:    slli s7, s7, 4
+; RV32ZVBC32-NEXT:    add s7, sp, s7
+; RV32ZVBC32-NEXT:    addi s7, s7, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s7) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v20
+; RV32ZVBC32-NEXT:    csrr s7, vlenb
+; RV32ZVBC32-NEXT:    slli s7, s7, 2
+; RV32ZVBC32-NEXT:    mv s8, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s8, s8, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 2
+; RV32ZVBC32-NEXT:    add s7, s7, s8
+; RV32ZVBC32-NEXT:    add s7, sp, s7
+; RV32ZVBC32-NEXT:    addi s7, s7, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s7) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v24
+; RV32ZVBC32-NEXT:    csrr s7, vlenb
+; RV32ZVBC32-NEXT:    slli s7, s7, 2
+; RV32ZVBC32-NEXT:    mv s8, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 4
+; RV32ZVBC32-NEXT:    add s7, s7, s8
+; RV32ZVBC32-NEXT:    add s7, sp, s7
+; RV32ZVBC32-NEXT:    addi s7, s7, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s7) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v28
+; RV32ZVBC32-NEXT:    csrr s7, vlenb
+; RV32ZVBC32-NEXT:    slli s7, s7, 2
+; RV32ZVBC32-NEXT:    mv s8, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s8, s8, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s8, s8, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 2
+; RV32ZVBC32-NEXT:    add s7, s7, s8
+; RV32ZVBC32-NEXT:    add s7, sp, s7
+; RV32ZVBC32-NEXT:    addi s7, s7, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s7) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v20, (s6), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s5), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (s4), zero
+; RV32ZVBC32-NEXT:    vlse64.v v4, (s11), zero
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v20
+; RV32ZVBC32-NEXT:    csrr s4, vlenb
+; RV32ZVBC32-NEXT:    slli s4, s4, 2
+; RV32ZVBC32-NEXT:    mv s5, s4
+; RV32ZVBC32-NEXT:    slli s4, s4, 1
+; RV32ZVBC32-NEXT:    add s4, s4, s5
+; RV32ZVBC32-NEXT:    add s4, sp, s4
+; RV32ZVBC32-NEXT:    addi s4, s4, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v24
+; RV32ZVBC32-NEXT:    csrr s4, vlenb
+; RV32ZVBC32-NEXT:    slli s4, s4, 3
+; RV32ZVBC32-NEXT:    mv s5, s4
+; RV32ZVBC32-NEXT:    slli s4, s4, 2
+; RV32ZVBC32-NEXT:    add s4, s4, s5
+; RV32ZVBC32-NEXT:    add s4, sp, s4
+; RV32ZVBC32-NEXT:    addi s4, s4, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v28
+; RV32ZVBC32-NEXT:    csrr s4, vlenb
+; RV32ZVBC32-NEXT:    slli s4, s4, 6
+; RV32ZVBC32-NEXT:    add s4, sp, s4
+; RV32ZVBC32-NEXT:    addi s4, s4, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v4
+; RV32ZVBC32-NEXT:    csrr s4, vlenb
+; RV32ZVBC32-NEXT:    slli s4, s4, 3
+; RV32ZVBC32-NEXT:    mv s5, s4
+; RV32ZVBC32-NEXT:    slli s4, s4, 1
+; RV32ZVBC32-NEXT:    add s5, s5, s4
+; RV32ZVBC32-NEXT:    slli s4, s4, 2
+; RV32ZVBC32-NEXT:    add s4, s4, s5
+; RV32ZVBC32-NEXT:    add s4, sp, s4
+; RV32ZVBC32-NEXT:    addi s4, s4, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s3), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (s2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v4, (s1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v0, (s0), zero
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v24
+; RV32ZVBC32-NEXT:    csrr s0, vlenb
+; RV32ZVBC32-NEXT:    slli s0, s0, 3
+; RV32ZVBC32-NEXT:    add s0, sp, s0
+; RV32ZVBC32-NEXT:    addi s0, s0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v28
+; RV32ZVBC32-NEXT:    csrr s0, vlenb
+; RV32ZVBC32-NEXT:    slli s0, s0, 2
+; RV32ZVBC32-NEXT:    mv s1, s0
+; RV32ZVBC32-NEXT:    slli s0, s0, 3
+; RV32ZVBC32-NEXT:    add s0, s0, s1
+; RV32ZVBC32-NEXT:    add s0, sp, s0
+; RV32ZVBC32-NEXT:    addi s0, s0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v4
+; RV32ZVBC32-NEXT:    csrr s0, vlenb
+; RV32ZVBC32-NEXT:    slli s0, s0, 2
+; RV32ZVBC32-NEXT:    mv s1, s0
+; RV32ZVBC32-NEXT:    slli s0, s0, 1
+; RV32ZVBC32-NEXT:    add s1, s1, s0
+; RV32ZVBC32-NEXT:    slli s0, s0, 1
+; RV32ZVBC32-NEXT:    add s1, s1, s0
+; RV32ZVBC32-NEXT:    slli s0, s0, 1
+; RV32ZVBC32-NEXT:    add s0, s0, s1
+; RV32ZVBC32-NEXT:    add s0, sp, s0
+; RV32ZVBC32-NEXT:    addi s0, s0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v0
+; RV32ZVBC32-NEXT:    csrr s0, vlenb
+; RV32ZVBC32-NEXT:    slli s0, s0, 2
+; RV32ZVBC32-NEXT:    mv s1, s0
+; RV32ZVBC32-NEXT:    slli s0, s0, 2
+; RV32ZVBC32-NEXT:    add s1, s1, s0
+; RV32ZVBC32-NEXT:    slli s0, s0, 2
+; RV32ZVBC32-NEXT:    add s0, s0, s1
+; RV32ZVBC32-NEXT:    add s0, sp, s0
+; RV32ZVBC32-NEXT:    addi s0, s0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (s0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t6), zero
+; RV32ZVBC32-NEXT:    vlse64.v v4, (t5), zero
+; RV32ZVBC32-NEXT:    vlse64.v v0, (t4), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (t3), zero
+; RV32ZVBC32-NEXT:    vand.vv v20, v12, v28
+; RV32ZVBC32-NEXT:    csrr t3, vlenb
+; RV32ZVBC32-NEXT:    slli t3, t3, 2
+; RV32ZVBC32-NEXT:    add t3, sp, t3
+; RV32ZVBC32-NEXT:    addi t3, t3, 288
+; RV32ZVBC32-NEXT:    vs4r.v v20, (t3) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v20, v12, v4
+; RV32ZVBC32-NEXT:    csrr t3, vlenb
+; RV32ZVBC32-NEXT:    slli t3, t3, 5
+; RV32ZVBC32-NEXT:    add t3, sp, t3
+; RV32ZVBC32-NEXT:    addi t3, t3, 288
+; RV32ZVBC32-NEXT:    vs4r.v v20, (t3) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v20, v12, v0
+; RV32ZVBC32-NEXT:    csrr t3, vlenb
+; RV32ZVBC32-NEXT:    slli t3, t3, 3
+; RV32ZVBC32-NEXT:    mv t4, t3
+; RV32ZVBC32-NEXT:    slli t3, t3, 1
+; RV32ZVBC32-NEXT:    add t4, t4, t3
+; RV32ZVBC32-NEXT:    slli t3, t3, 1
+; RV32ZVBC32-NEXT:    add t3, t3, t4
+; RV32ZVBC32-NEXT:    add t3, sp, t3
+; RV32ZVBC32-NEXT:    addi t3, t3, 288
+; RV32ZVBC32-NEXT:    vs4r.v v20, (t3) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v16
+; RV32ZVBC32-NEXT:    csrr t3, vlenb
+; RV32ZVBC32-NEXT:    slli t3, t3, 4
+; RV32ZVBC32-NEXT:    mv t4, t3
+; RV32ZVBC32-NEXT:    slli t3, t3, 2
+; RV32ZVBC32-NEXT:    add t3, t3, t4
+; RV32ZVBC32-NEXT:    add t3, sp, t3
+; RV32ZVBC32-NEXT:    addi t3, t3, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (t3) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v16, (t2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v0, (t1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (t0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a7), zero
+; RV32ZVBC32-NEXT:    vand.vv v4, v12, v16
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v0
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 2
+; RV32ZVBC32-NEXT:    mv t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 1
+; RV32ZVBC32-NEXT:    add t0, t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 1
+; RV32ZVBC32-NEXT:    add a7, a7, t0
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a7) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v20
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 2
+; RV32ZVBC32-NEXT:    mv t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 2
+; RV32ZVBC32-NEXT:    add t0, t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 1
+; RV32ZVBC32-NEXT:    add a7, a7, t0
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a7) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v24
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 2
+; RV32ZVBC32-NEXT:    mv t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 1
+; RV32ZVBC32-NEXT:    add t0, t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 3
+; RV32ZVBC32-NEXT:    add a7, a7, t0
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a7) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a6), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a5), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a4), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a2), zero
+; RV32ZVBC32-NEXT:    vand.vv v0, v12, v16
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v20
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a2, 3
+; RV32ZVBC32-NEXT:    mv a4, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a4
+; RV32ZVBC32-NEXT:    add a2, sp, a2
+; RV32ZVBC32-NEXT:    addi a2, a2, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a2) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v24
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a2, 4
+; RV32ZVBC32-NEXT:    mv a4, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a4
+; RV32ZVBC32-NEXT:    add a2, sp, a2
+; RV32ZVBC32-NEXT:    addi a2, a2, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a2) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v28
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a2, 3
+; RV32ZVBC32-NEXT:    mv a4, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 3
+; RV32ZVBC32-NEXT:    add a2, a2, a4
+; RV32ZVBC32-NEXT:    add a2, sp, a2
+; RV32ZVBC32-NEXT:    addi a2, a2, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a2) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a3), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (ra), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a0), zero
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v16
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v12, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vand.vv v20, v12, v28
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    vand.vx v12, v12, a0
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v16, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 352
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv4i64_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vclmul.vx v8, v8, a0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmul_nxv4i64_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    andi a1, a0, 2
+; RV64ZVBC32-NEXT:    andi a2, a0, 1
+; RV64ZVBC32-NEXT:    vsetvli a3, zero, e64, m4, ta, ma
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 4
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 8
+; RV64ZVBC32-NEXT:    vxor.vv v12, v16, v12
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 16
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 32
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 64
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 128
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 256
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 512
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a1
+; RV64ZVBC32-NEXT:    andi a3, a0, 1024
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    li a1, 1
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a3
+; RV64ZVBC32-NEXT:    slli a2, a1, 11
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 64
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 128
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 256
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 512
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1024
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2048
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4096
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8192
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16384
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32768
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 65536
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 131072
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 262144
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    srliw a2, a0, 31
+; RV64ZVBC32-NEXT:    slli a2, a2, 31
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 32
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 33
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 34
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 35
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 36
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 37
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 38
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 39
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 40
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 41
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 42
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 43
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 44
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 45
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 46
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 47
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 48
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 49
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 50
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 51
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 52
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 53
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 54
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 55
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 56
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 57
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 58
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 59
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 60
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 61
+; RV64ZVBC32-NEXT:    slli a1, a1, 62
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    and a1, a0, a1
+; RV64ZVBC32-NEXT:    srli a0, a0, 63
+; RV64ZVBC32-NEXT:    slli a0, a0, 63
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a1
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
   %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
   %v = call <vscale x 4 x i64> @llvm.clmul.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb)
@@ -13322,17 +23155,1723 @@ define <vscale x 8 x i64> @clmul_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    vxor.vv v8, v24, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv8i64_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vclmul.vv v8, v8, v16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv8i64_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv8i64_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vclmul.vv v8, v8, v16
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmul_nxv8i64_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv8i64_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -352
+; RV32ZVBC32-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    sub sp, sp, a0
+; RV32ZVBC32-NEXT:    lui a1, 524288
+; RV32ZVBC32-NEXT:    li s5, 1
+; RV32ZVBC32-NEXT:    li a3, 2
+; RV32ZVBC32-NEXT:    li a2, 4
+; RV32ZVBC32-NEXT:    li s10, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    li t6, 32
+; RV32ZVBC32-NEXT:    li s1, 64
+; RV32ZVBC32-NEXT:    li s3, 128
+; RV32ZVBC32-NEXT:    li s7, 256
+; RV32ZVBC32-NEXT:    li s4, 512
+; RV32ZVBC32-NEXT:    li s8, 1024
+; RV32ZVBC32-NEXT:    lui ra, 1
+; RV32ZVBC32-NEXT:    lui s11, 2
+; RV32ZVBC32-NEXT:    lui s9, 4
+; RV32ZVBC32-NEXT:    lui s6, 8
+; RV32ZVBC32-NEXT:    lui s2, 16
+; RV32ZVBC32-NEXT:    lui s0, 32
+; RV32ZVBC32-NEXT:    lui t5, 64
+; RV32ZVBC32-NEXT:    lui t4, 128
+; RV32ZVBC32-NEXT:    lui t3, 256
+; RV32ZVBC32-NEXT:    lui t2, 512
+; RV32ZVBC32-NEXT:    lui t1, 1024
+; RV32ZVBC32-NEXT:    lui t0, 2048
+; RV32ZVBC32-NEXT:    lui a7, 4096
+; RV32ZVBC32-NEXT:    lui a6, 8192
+; RV32ZVBC32-NEXT:    lui a5, 16384
+; RV32ZVBC32-NEXT:    lui a4, 32768
+; RV32ZVBC32-NEXT:    sw a1, 16(sp)
+; RV32ZVBC32-NEXT:    sw zero, 20(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw s5, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw a3, 268(sp)
+; RV32ZVBC32-NEXT:    lui a3, 65536
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw a2, 260(sp)
+; RV32ZVBC32-NEXT:    lui a2, 131072
+; RV32ZVBC32-NEXT:    sw zero, 248(sp)
+; RV32ZVBC32-NEXT:    sw s10, 252(sp)
+; RV32ZVBC32-NEXT:    vsetvli s10, zero, e64, m8, ta, ma
+; RV32ZVBC32-NEXT:    vand.vi v24, v16, 2
+; RV32ZVBC32-NEXT:    vand.vi v0, v16, 1
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v0, v24
+; RV32ZVBC32-NEXT:    vand.vi v0, v16, 4
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vi v0, v16, 8
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    sw a0, 244(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC32-NEXT:    addi s10, sp, 16
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    sw t6, 236(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, t6
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw s1, 228(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s1
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s3, 220(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s3
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s7, 212(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s7
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s4, 204(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s4
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw s8, 196(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s8
+; RV32ZVBC32-NEXT:    slli s5, s5, 11
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw s5, 188(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s5
+; RV32ZVBC32-NEXT:    addi s5, sp, 224
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, ra
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw ra, 180(sp)
+; RV32ZVBC32-NEXT:    addi ra, sp, 216
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s11
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw s11, 172(sp)
+; RV32ZVBC32-NEXT:    addi s11, sp, 208
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s9
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw s9, 164(sp)
+; RV32ZVBC32-NEXT:    addi s9, sp, 200
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s6
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw s6, 156(sp)
+; RV32ZVBC32-NEXT:    addi s6, sp, 192
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s2
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw s2, 148(sp)
+; RV32ZVBC32-NEXT:    addi s3, sp, 184
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s0
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw s0, 140(sp)
+; RV32ZVBC32-NEXT:    addi s4, sp, 176
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, t5
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw t5, 132(sp)
+; RV32ZVBC32-NEXT:    addi s2, sp, 168
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, t4
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw t4, 124(sp)
+; RV32ZVBC32-NEXT:    addi s1, sp, 160
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, t3
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw t3, 116(sp)
+; RV32ZVBC32-NEXT:    addi t6, sp, 152
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, t2
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw t2, 108(sp)
+; RV32ZVBC32-NEXT:    addi s0, sp, 144
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, t1
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw t1, 100(sp)
+; RV32ZVBC32-NEXT:    addi t5, sp, 136
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, t0
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw t0, 92(sp)
+; RV32ZVBC32-NEXT:    addi t4, sp, 128
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a7
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw a7, 84(sp)
+; RV32ZVBC32-NEXT:    addi t2, sp, 120
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a6
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw a6, 76(sp)
+; RV32ZVBC32-NEXT:    addi t3, sp, 112
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a5
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw a5, 68(sp)
+; RV32ZVBC32-NEXT:    addi t1, sp, 104
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a4
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw a4, 60(sp)
+; RV32ZVBC32-NEXT:    addi t0, sp, 96
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a3
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw a3, 52(sp)
+; RV32ZVBC32-NEXT:    addi a7, sp, 88
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a2
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw a2, 44(sp)
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    sw a0, 36(sp)
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    sw a1, 28(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    sw t2, 4(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 3
+; RV32ZVBC32-NEXT:    mv a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 5
+; RV32ZVBC32-NEXT:    add a1, a1, a2
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s10), zero
+; RV32ZVBC32-NEXT:    addi a6, sp, 80
+; RV32ZVBC32-NEXT:    addi a5, sp, 72
+; RV32ZVBC32-NEXT:    addi a4, sp, 64
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 8
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a1, sp, 56
+; RV32ZVBC32-NEXT:    addi s10, sp, 48
+; RV32ZVBC32-NEXT:    addi a3, sp, 40
+; RV32ZVBC32-NEXT:    addi a2, sp, 32
+; RV32ZVBC32-NEXT:    addi s7, sp, 272
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s7), zero
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 4
+; RV32ZVBC32-NEXT:    mv s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t2, t2, s7
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s7, sp, 264
+; RV32ZVBC32-NEXT:    vlse64.v v0, (s7), zero
+; RV32ZVBC32-NEXT:    addi s7, sp, 256
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s7), zero
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 3
+; RV32ZVBC32-NEXT:    mv s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t2, t2, s7
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s7, sp, 248
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s7), zero
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 3
+; RV32ZVBC32-NEXT:    mv s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 2
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t2, t2, s7
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 4
+; RV32ZVBC32-NEXT:    mv s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t2, t2, s7
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 5
+; RV32ZVBC32-NEXT:    mv s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t2, t2, s7
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 3
+; RV32ZVBC32-NEXT:    mv s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 2
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t2, t2, s7
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 3
+; RV32ZVBC32-NEXT:    mv s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t2, t2, s7
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 4
+; RV32ZVBC32-NEXT:    mv s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t2, t2, s7
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 3
+; RV32ZVBC32-NEXT:    mv s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 2
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t2, t2, s7
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 3
+; RV32ZVBC32-NEXT:    mv s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add s7, s7, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t2, t2, s7
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s7, sp, 24
+; RV32ZVBC32-NEXT:    addi s8, sp, 240
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s8), zero
+; RV32ZVBC32-NEXT:    csrr s8, vlenb
+; RV32ZVBC32-NEXT:    slli s8, s8, 4
+; RV32ZVBC32-NEXT:    mv t2, s8
+; RV32ZVBC32-NEXT:    slli s8, s8, 2
+; RV32ZVBC32-NEXT:    add t2, t2, s8
+; RV32ZVBC32-NEXT:    slli s8, s8, 1
+; RV32ZVBC32-NEXT:    add s8, s8, t2
+; RV32ZVBC32-NEXT:    lw t2, 4(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    add s8, sp, s8
+; RV32ZVBC32-NEXT:    addi s8, s8, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s8) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s8, sp, 232
+; RV32ZVBC32-NEXT:    vlse64.v v0, (s8), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s5), zero
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 3
+; RV32ZVBC32-NEXT:    mv s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s8, s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 2
+; RV32ZVBC32-NEXT:    add s8, s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s5, s5, s8
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (ra), zero
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 3
+; RV32ZVBC32-NEXT:    mv s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 3
+; RV32ZVBC32-NEXT:    add s8, s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s5, s5, s8
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 4
+; RV32ZVBC32-NEXT:    mv s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 2
+; RV32ZVBC32-NEXT:    add s8, s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s5, s5, s8
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (s5) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 3
+; RV32ZVBC32-NEXT:    mv s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s8, s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s5, s5, s8
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 4
+; RV32ZVBC32-NEXT:    mv s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s8, s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s5, s5, s8
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 3
+; RV32ZVBC32-NEXT:    mv s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s8, s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 2
+; RV32ZVBC32-NEXT:    add s8, s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s5, s5, s8
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (s5) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 3
+; RV32ZVBC32-NEXT:    mv s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 2
+; RV32ZVBC32-NEXT:    add s8, s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 2
+; RV32ZVBC32-NEXT:    add s5, s5, s8
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 3
+; RV32ZVBC32-NEXT:    mv s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 3
+; RV32ZVBC32-NEXT:    add s8, s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s5, s5, s8
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (s5) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 3
+; RV32ZVBC32-NEXT:    mv s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s8, s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 2
+; RV32ZVBC32-NEXT:    add s8, s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s5, s5, s8
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (s11), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s9), zero
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 3
+; RV32ZVBC32-NEXT:    mv s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 3
+; RV32ZVBC32-NEXT:    add s8, s8, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s5, s5, s8
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s6), zero
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 4
+; RV32ZVBC32-NEXT:    mv s6, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 2
+; RV32ZVBC32-NEXT:    add s6, s6, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s5, s5, s6
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s3), zero
+; RV32ZVBC32-NEXT:    csrr s3, vlenb
+; RV32ZVBC32-NEXT:    slli s3, s3, 6
+; RV32ZVBC32-NEXT:    mv s5, s3
+; RV32ZVBC32-NEXT:    slli s3, s3, 1
+; RV32ZVBC32-NEXT:    add s3, s3, s5
+; RV32ZVBC32-NEXT:    add s3, sp, s3
+; RV32ZVBC32-NEXT:    addi s3, s3, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s3) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    csrr s3, vlenb
+; RV32ZVBC32-NEXT:    slli s3, s3, 4
+; RV32ZVBC32-NEXT:    mv s5, s3
+; RV32ZVBC32-NEXT:    slli s3, s3, 1
+; RV32ZVBC32-NEXT:    add s3, s3, s5
+; RV32ZVBC32-NEXT:    add s3, sp, s3
+; RV32ZVBC32-NEXT:    addi s3, s3, 288
+; RV32ZVBC32-NEXT:    vs8r.v v0, (s3) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr s3, vlenb
+; RV32ZVBC32-NEXT:    slli s3, s3, 3
+; RV32ZVBC32-NEXT:    mv s5, s3
+; RV32ZVBC32-NEXT:    slli s3, s3, 3
+; RV32ZVBC32-NEXT:    add s5, s5, s3
+; RV32ZVBC32-NEXT:    slli s3, s3, 1
+; RV32ZVBC32-NEXT:    add s3, s3, s5
+; RV32ZVBC32-NEXT:    add s3, sp, s3
+; RV32ZVBC32-NEXT:    addi s3, s3, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (s3) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr s3, vlenb
+; RV32ZVBC32-NEXT:    slli s3, s3, 3
+; RV32ZVBC32-NEXT:    mv s5, s3
+; RV32ZVBC32-NEXT:    slli s3, s3, 2
+; RV32ZVBC32-NEXT:    add s5, s5, s3
+; RV32ZVBC32-NEXT:    slli s3, s3, 1
+; RV32ZVBC32-NEXT:    add s3, s3, s5
+; RV32ZVBC32-NEXT:    add s3, sp, s3
+; RV32ZVBC32-NEXT:    addi s3, s3, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s3) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr s3, vlenb
+; RV32ZVBC32-NEXT:    slli s3, s3, 4
+; RV32ZVBC32-NEXT:    mv s5, s3
+; RV32ZVBC32-NEXT:    slli s3, s3, 2
+; RV32ZVBC32-NEXT:    add s5, s5, s3
+; RV32ZVBC32-NEXT:    slli s3, s3, 1
+; RV32ZVBC32-NEXT:    add s3, s3, s5
+; RV32ZVBC32-NEXT:    add s3, sp, s3
+; RV32ZVBC32-NEXT:    addi s3, s3, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (s3) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr s3, vlenb
+; RV32ZVBC32-NEXT:    slli s3, s3, 5
+; RV32ZVBC32-NEXT:    mv s5, s3
+; RV32ZVBC32-NEXT:    slli s3, s3, 2
+; RV32ZVBC32-NEXT:    add s3, s3, s5
+; RV32ZVBC32-NEXT:    add s3, sp, s3
+; RV32ZVBC32-NEXT:    addi s3, s3, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s3) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr s3, vlenb
+; RV32ZVBC32-NEXT:    slli s3, s3, 6
+; RV32ZVBC32-NEXT:    mv s5, s3
+; RV32ZVBC32-NEXT:    slli s3, s3, 1
+; RV32ZVBC32-NEXT:    add s3, s3, s5
+; RV32ZVBC32-NEXT:    add s3, sp, s3
+; RV32ZVBC32-NEXT:    addi s3, s3, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (s3) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr s3, vlenb
+; RV32ZVBC32-NEXT:    slli s3, s3, 4
+; RV32ZVBC32-NEXT:    mv s5, s3
+; RV32ZVBC32-NEXT:    slli s3, s3, 2
+; RV32ZVBC32-NEXT:    add s5, s5, s3
+; RV32ZVBC32-NEXT:    slli s3, s3, 1
+; RV32ZVBC32-NEXT:    add s3, s3, s5
+; RV32ZVBC32-NEXT:    add s3, sp, s3
+; RV32ZVBC32-NEXT:    addi s3, s3, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s3) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s4), zero
+; RV32ZVBC32-NEXT:    csrr s3, vlenb
+; RV32ZVBC32-NEXT:    slli s3, s3, 6
+; RV32ZVBC32-NEXT:    mv s4, s3
+; RV32ZVBC32-NEXT:    slli s3, s3, 1
+; RV32ZVBC32-NEXT:    add s3, s3, s4
+; RV32ZVBC32-NEXT:    add s3, sp, s3
+; RV32ZVBC32-NEXT:    addi s3, s3, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s3) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (s2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s1), zero
+; RV32ZVBC32-NEXT:    csrr s1, vlenb
+; RV32ZVBC32-NEXT:    slli s1, s1, 3
+; RV32ZVBC32-NEXT:    mv s2, s1
+; RV32ZVBC32-NEXT:    slli s1, s1, 3
+; RV32ZVBC32-NEXT:    add s2, s2, s1
+; RV32ZVBC32-NEXT:    slli s1, s1, 1
+; RV32ZVBC32-NEXT:    add s1, s1, s2
+; RV32ZVBC32-NEXT:    add s1, sp, s1
+; RV32ZVBC32-NEXT:    addi s1, s1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t6), zero
+; RV32ZVBC32-NEXT:    csrr t6, vlenb
+; RV32ZVBC32-NEXT:    slli t6, t6, 3
+; RV32ZVBC32-NEXT:    mv s1, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 1
+; RV32ZVBC32-NEXT:    add s1, s1, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 1
+; RV32ZVBC32-NEXT:    add s1, s1, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 2
+; RV32ZVBC32-NEXT:    add t6, t6, s1
+; RV32ZVBC32-NEXT:    add t6, sp, t6
+; RV32ZVBC32-NEXT:    addi t6, t6, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t6) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t6, vlenb
+; RV32ZVBC32-NEXT:    slli t6, t6, 6
+; RV32ZVBC32-NEXT:    mv s1, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 1
+; RV32ZVBC32-NEXT:    add t6, t6, s1
+; RV32ZVBC32-NEXT:    add t6, sp, t6
+; RV32ZVBC32-NEXT:    addi t6, t6, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t6) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t6, vlenb
+; RV32ZVBC32-NEXT:    slli t6, t6, 3
+; RV32ZVBC32-NEXT:    mv s1, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 2
+; RV32ZVBC32-NEXT:    add t6, t6, s1
+; RV32ZVBC32-NEXT:    add t6, sp, t6
+; RV32ZVBC32-NEXT:    addi t6, t6, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t6) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr t6, vlenb
+; RV32ZVBC32-NEXT:    slli t6, t6, 5
+; RV32ZVBC32-NEXT:    mv s1, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 1
+; RV32ZVBC32-NEXT:    add t6, t6, s1
+; RV32ZVBC32-NEXT:    add t6, sp, t6
+; RV32ZVBC32-NEXT:    addi t6, t6, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t6) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t6, vlenb
+; RV32ZVBC32-NEXT:    slli t6, t6, 3
+; RV32ZVBC32-NEXT:    mv s1, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 3
+; RV32ZVBC32-NEXT:    add s1, s1, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 1
+; RV32ZVBC32-NEXT:    add t6, t6, s1
+; RV32ZVBC32-NEXT:    add t6, sp, t6
+; RV32ZVBC32-NEXT:    addi t6, t6, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t6) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t6, vlenb
+; RV32ZVBC32-NEXT:    slli t6, t6, 3
+; RV32ZVBC32-NEXT:    mv s1, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 1
+; RV32ZVBC32-NEXT:    add s1, s1, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 3
+; RV32ZVBC32-NEXT:    add t6, t6, s1
+; RV32ZVBC32-NEXT:    add t6, sp, t6
+; RV32ZVBC32-NEXT:    addi t6, t6, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t6) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t6, vlenb
+; RV32ZVBC32-NEXT:    slli t6, t6, 3
+; RV32ZVBC32-NEXT:    mv s1, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 1
+; RV32ZVBC32-NEXT:    add s1, s1, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 1
+; RV32ZVBC32-NEXT:    add s1, s1, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 2
+; RV32ZVBC32-NEXT:    add t6, t6, s1
+; RV32ZVBC32-NEXT:    add t6, sp, t6
+; RV32ZVBC32-NEXT:    addi t6, t6, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t6) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t6, vlenb
+; RV32ZVBC32-NEXT:    slli t6, t6, 3
+; RV32ZVBC32-NEXT:    mv s1, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 3
+; RV32ZVBC32-NEXT:    add s1, s1, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 1
+; RV32ZVBC32-NEXT:    add t6, t6, s1
+; RV32ZVBC32-NEXT:    add t6, sp, t6
+; RV32ZVBC32-NEXT:    addi t6, t6, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t6) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s0), zero
+; RV32ZVBC32-NEXT:    csrr t6, vlenb
+; RV32ZVBC32-NEXT:    slli t6, t6, 3
+; RV32ZVBC32-NEXT:    mv s0, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 1
+; RV32ZVBC32-NEXT:    add s0, s0, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 1
+; RV32ZVBC32-NEXT:    add s0, s0, t6
+; RV32ZVBC32-NEXT:    slli t6, t6, 2
+; RV32ZVBC32-NEXT:    add t6, t6, s0
+; RV32ZVBC32-NEXT:    add t6, sp, t6
+; RV32ZVBC32-NEXT:    addi t6, t6, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t6) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (t5), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t4), zero
+; RV32ZVBC32-NEXT:    csrr t4, vlenb
+; RV32ZVBC32-NEXT:    slli t4, t4, 6
+; RV32ZVBC32-NEXT:    mv t5, t4
+; RV32ZVBC32-NEXT:    slli t4, t4, 1
+; RV32ZVBC32-NEXT:    add t4, t4, t5
+; RV32ZVBC32-NEXT:    add t4, sp, t4
+; RV32ZVBC32-NEXT:    addi t4, t4, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t4) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t2), zero
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 4
+; RV32ZVBC32-NEXT:    mv t4, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t4, t4, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 2
+; RV32ZVBC32-NEXT:    add t2, t2, t4
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 3
+; RV32ZVBC32-NEXT:    mv t4, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t4, t4, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t4, t4, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 2
+; RV32ZVBC32-NEXT:    add t2, t2, t4
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 5
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 3
+; RV32ZVBC32-NEXT:    mv t4, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t4, t4, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 2
+; RV32ZVBC32-NEXT:    add t2, t2, t4
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 6
+; RV32ZVBC32-NEXT:    mv t4, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t2, t2, t4
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 4
+; RV32ZVBC32-NEXT:    mv t4, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 3
+; RV32ZVBC32-NEXT:    add t2, t2, t4
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 4
+; RV32ZVBC32-NEXT:    mv t4, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t4, t4, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 2
+; RV32ZVBC32-NEXT:    add t2, t2, t4
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 6
+; RV32ZVBC32-NEXT:    mv t4, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t2, t2, t4
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t3), zero
+; RV32ZVBC32-NEXT:    csrr t2, vlenb
+; RV32ZVBC32-NEXT:    slli t2, t2, 4
+; RV32ZVBC32-NEXT:    mv t3, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 1
+; RV32ZVBC32-NEXT:    add t3, t3, t2
+; RV32ZVBC32-NEXT:    slli t2, t2, 2
+; RV32ZVBC32-NEXT:    add t2, t2, t3
+; RV32ZVBC32-NEXT:    add t2, sp, t2
+; RV32ZVBC32-NEXT:    addi t2, t2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (t1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr t0, vlenb
+; RV32ZVBC32-NEXT:    slli t0, t0, 3
+; RV32ZVBC32-NEXT:    mv t1, t0
+; RV32ZVBC32-NEXT:    slli t0, t0, 1
+; RV32ZVBC32-NEXT:    add t1, t1, t0
+; RV32ZVBC32-NEXT:    slli t0, t0, 1
+; RV32ZVBC32-NEXT:    add t1, t1, t0
+; RV32ZVBC32-NEXT:    slli t0, t0, 2
+; RV32ZVBC32-NEXT:    add t0, t0, t1
+; RV32ZVBC32-NEXT:    add t0, sp, t0
+; RV32ZVBC32-NEXT:    addi t0, t0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a7), zero
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 7
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 4
+; RV32ZVBC32-NEXT:    mv t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 1
+; RV32ZVBC32-NEXT:    add t0, t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 2
+; RV32ZVBC32-NEXT:    add a7, a7, t0
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a7) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 3
+; RV32ZVBC32-NEXT:    mv t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 1
+; RV32ZVBC32-NEXT:    add a7, a7, t0
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 4
+; RV32ZVBC32-NEXT:    mv t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 2
+; RV32ZVBC32-NEXT:    add a7, a7, t0
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 3
+; RV32ZVBC32-NEXT:    mv t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 1
+; RV32ZVBC32-NEXT:    add t0, t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 1
+; RV32ZVBC32-NEXT:    add t0, t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 2
+; RV32ZVBC32-NEXT:    add a7, a7, t0
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a7) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 3
+; RV32ZVBC32-NEXT:    mv t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 4
+; RV32ZVBC32-NEXT:    add a7, a7, t0
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 7
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a7) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 3
+; RV32ZVBC32-NEXT:    mv t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 1
+; RV32ZVBC32-NEXT:    add t0, t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 1
+; RV32ZVBC32-NEXT:    add t0, t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 2
+; RV32ZVBC32-NEXT:    add a7, a7, t0
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a6), zero
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 7
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a5), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a4), zero
+; RV32ZVBC32-NEXT:    csrr a4, vlenb
+; RV32ZVBC32-NEXT:    slli a4, a4, 4
+; RV32ZVBC32-NEXT:    mv a5, a4
+; RV32ZVBC32-NEXT:    slli a4, a4, 1
+; RV32ZVBC32-NEXT:    add a5, a5, a4
+; RV32ZVBC32-NEXT:    slli a4, a4, 2
+; RV32ZVBC32-NEXT:    add a4, a4, a5
+; RV32ZVBC32-NEXT:    add a4, sp, a4
+; RV32ZVBC32-NEXT:    addi a4, a4, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a1), zero
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 3
+; RV32ZVBC32-NEXT:    mv a4, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a4
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 7
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 4
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 3
+; RV32ZVBC32-NEXT:    mv a4, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a4
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 4
+; RV32ZVBC32-NEXT:    mv a4, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a4
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 7
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 3
+; RV32ZVBC32-NEXT:    mv a4, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a4
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 4
+; RV32ZVBC32-NEXT:    mv a4, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a4
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s10), zero
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 6
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a3), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a2), zero
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 3
+; RV32ZVBC32-NEXT:    mv a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a2
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s7), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 6
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 3
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 6
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 3
+; RV32ZVBC32-NEXT:    mv a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a2
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 3
+; RV32ZVBC32-NEXT:    mv a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a2
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a1, sp, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v0, v16, v24
+; RV32ZVBC32-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v24, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 352
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv8i64_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC32-NEXT:    vand.vi v24, v16, 2
+; RV64ZVBC32-NEXT:    vand.vi v0, v16, 1
+; RV64ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v0, v24
+; RV64ZVBC32-NEXT:    vand.vi v0, v16, 4
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vi v0, v16, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    li a1, 32
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    li a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    li a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC32-NEXT:    li a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a2
+; RV64ZVBC32-NEXT:    slli a1, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 1
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 2
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 4
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 8
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 16
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 32
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 64
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 256
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    lui a1, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 31
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 33
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 34
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 35
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 36
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 37
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 38
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 39
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 40
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 41
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 42
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 43
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 44
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 45
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 46
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 47
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 48
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 49
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 50
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 51
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 52
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 53
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 54
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 55
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 56
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 57
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 58
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 59
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 60
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 61
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a1
+; RV64ZVBC32-NEXT:    li a1, -1
+; RV64ZVBC32-NEXT:    slli a0, a0, 62
+; RV64ZVBC32-NEXT:    slli a1, a1, 63
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    vxor.vv v8, v24, v8
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 8 x i64> @llvm.clmul.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb)
   ret <vscale x 8 x i64> %v
 }
@@ -15052,23 +26591,1737 @@ define <vscale x 8 x i64> @clmul_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) noun
 ; RV64V-NEXT:    vxor.vv v8, v16, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv8i64_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    addi sp, sp, -16
-; RV32ZVBC-NEXT:    sw a0, 8(sp)
-; RV32ZVBC-NEXT:    sw a1, 12(sp)
-; RV32ZVBC-NEXT:    addi a0, sp, 8
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vlse64.v v16, (a0), zero
-; RV32ZVBC-NEXT:    vclmul.vv v8, v8, v16
-; RV32ZVBC-NEXT:    addi sp, sp, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv8i64_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    addi sp, sp, -16
+; RV32ZVBC64-NEXT:    sw a0, 8(sp)
+; RV32ZVBC64-NEXT:    sw a1, 12(sp)
+; RV32ZVBC64-NEXT:    addi a0, sp, 8
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vlse64.v v16, (a0), zero
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    addi sp, sp, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv8i64_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vclmul.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv8i64_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -352
+; RV32ZVBC32-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a2, 4
+; RV32ZVBC32-NEXT:    mv a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 4
+; RV32ZVBC32-NEXT:    add a2, a2, a3
+; RV32ZVBC32-NEXT:    sub sp, sp, a2
+; RV32ZVBC32-NEXT:    sw a0, 8(sp)
+; RV32ZVBC32-NEXT:    sw a1, 12(sp)
+; RV32ZVBC32-NEXT:    addi a0, sp, 8
+; RV32ZVBC32-NEXT:    lui a1, 524288
+; RV32ZVBC32-NEXT:    li ra, 1
+; RV32ZVBC32-NEXT:    li a6, 2
+; RV32ZVBC32-NEXT:    li s8, 4
+; RV32ZVBC32-NEXT:    li s11, 8
+; RV32ZVBC32-NEXT:    li s10, 128
+; RV32ZVBC32-NEXT:    li s9, 256
+; RV32ZVBC32-NEXT:    li s7, 512
+; RV32ZVBC32-NEXT:    li s6, 1024
+; RV32ZVBC32-NEXT:    lui s5, 1
+; RV32ZVBC32-NEXT:    lui s4, 2
+; RV32ZVBC32-NEXT:    lui s3, 4
+; RV32ZVBC32-NEXT:    lui s2, 8
+; RV32ZVBC32-NEXT:    lui s1, 16
+; RV32ZVBC32-NEXT:    lui s0, 32
+; RV32ZVBC32-NEXT:    lui t6, 64
+; RV32ZVBC32-NEXT:    lui t5, 128
+; RV32ZVBC32-NEXT:    lui t4, 256
+; RV32ZVBC32-NEXT:    lui t2, 512
+; RV32ZVBC32-NEXT:    lui t1, 1024
+; RV32ZVBC32-NEXT:    lui t0, 2048
+; RV32ZVBC32-NEXT:    lui a7, 4096
+; RV32ZVBC32-NEXT:    lui a5, 8192
+; RV32ZVBC32-NEXT:    lui a4, 16384
+; RV32ZVBC32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a0), zero
+; RV32ZVBC32-NEXT:    lui a2, 32768
+; RV32ZVBC32-NEXT:    sw a1, 16(sp)
+; RV32ZVBC32-NEXT:    lui t3, 524288
+; RV32ZVBC32-NEXT:    sw zero, 20(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw ra, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw a6, 268(sp)
+; RV32ZVBC32-NEXT:    lui a6, 65536
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw s8, 260(sp)
+; RV32ZVBC32-NEXT:    lui a1, 131072
+; RV32ZVBC32-NEXT:    sw zero, 248(sp)
+; RV32ZVBC32-NEXT:    sw s11, 252(sp)
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    li a3, 16
+; RV32ZVBC32-NEXT:    sw a3, 244(sp)
+; RV32ZVBC32-NEXT:    li s8, 16
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    li a3, 32
+; RV32ZVBC32-NEXT:    sw a3, 236(sp)
+; RV32ZVBC32-NEXT:    li a3, 32
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    li s11, 64
+; RV32ZVBC32-NEXT:    sw s11, 228(sp)
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s10, 220(sp)
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s9, 212(sp)
+; RV32ZVBC32-NEXT:    li s9, 256
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s7, 204(sp)
+; RV32ZVBC32-NEXT:    li s7, 512
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw s6, 196(sp)
+; RV32ZVBC32-NEXT:    li s6, 1024
+; RV32ZVBC32-NEXT:    slli ra, ra, 11
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw ra, 188(sp)
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw s5, 180(sp)
+; RV32ZVBC32-NEXT:    lui s11, 1
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw s4, 172(sp)
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw s3, 164(sp)
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw s2, 156(sp)
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw s1, 148(sp)
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw s0, 140(sp)
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw t6, 132(sp)
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw t5, 124(sp)
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw t4, 116(sp)
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw t2, 108(sp)
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw t1, 100(sp)
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw t0, 92(sp)
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw a7, 84(sp)
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw a5, 76(sp)
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw a4, 68(sp)
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw a2, 60(sp)
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw a6, 52(sp)
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw a1, 44(sp)
+; RV32ZVBC32-NEXT:    lui s10, 131072
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    sw a0, 36(sp)
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    sw t3, 28(sp)
+; RV32ZVBC32-NEXT:    addi a1, sp, 16
+; RV32ZVBC32-NEXT:    vand.vi v24, v16, 2
+; RV32ZVBC32-NEXT:    vand.vi v0, v16, 1
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v0, v24
+; RV32ZVBC32-NEXT:    vand.vi v0, v16, 4
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vi v0, v16, 8
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s8
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a3
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC32-NEXT:    addi s8, sp, 248
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s9
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s7
+; RV32ZVBC32-NEXT:    addi s9, sp, 232
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s6
+; RV32ZVBC32-NEXT:    addi s7, sp, 224
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, ra
+; RV32ZVBC32-NEXT:    addi ra, sp, 216
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s11
+; RV32ZVBC32-NEXT:    addi s6, sp, 208
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s4
+; RV32ZVBC32-NEXT:    addi s5, sp, 200
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s3
+; RV32ZVBC32-NEXT:    addi s4, sp, 192
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s2
+; RV32ZVBC32-NEXT:    addi s2, sp, 184
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s1
+; RV32ZVBC32-NEXT:    addi s3, sp, 176
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s0
+; RV32ZVBC32-NEXT:    addi s1, sp, 168
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, t6
+; RV32ZVBC32-NEXT:    addi s0, sp, 160
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, t5
+; RV32ZVBC32-NEXT:    addi t5, sp, 152
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, t4
+; RV32ZVBC32-NEXT:    addi t6, sp, 144
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, t2
+; RV32ZVBC32-NEXT:    addi t4, sp, 136
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, t1
+; RV32ZVBC32-NEXT:    addi t3, sp, 128
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, t0
+; RV32ZVBC32-NEXT:    addi t1, sp, 120
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a7
+; RV32ZVBC32-NEXT:    addi t2, sp, 112
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a5
+; RV32ZVBC32-NEXT:    addi t0, sp, 104
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a4
+; RV32ZVBC32-NEXT:    addi a7, sp, 96
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a2
+; RV32ZVBC32-NEXT:    addi a6, sp, 88
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    lui a0, 65536
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, s10
+; RV32ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    sw t1, 4(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a2, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a1), zero
+; RV32ZVBC32-NEXT:    addi a5, sp, 80
+; RV32ZVBC32-NEXT:    addi a4, sp, 72
+; RV32ZVBC32-NEXT:    addi a2, sp, 64
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 56
+; RV32ZVBC32-NEXT:    addi s11, sp, 48
+; RV32ZVBC32-NEXT:    addi a3, sp, 40
+; RV32ZVBC32-NEXT:    addi a1, sp, 32
+; RV32ZVBC32-NEXT:    addi s10, sp, 272
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s10), zero
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 4
+; RV32ZVBC32-NEXT:    mv s10, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s10, s10, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s10, s10, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t1, t1, s10
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s10, sp, 264
+; RV32ZVBC32-NEXT:    vlse64.v v0, (s10), zero
+; RV32ZVBC32-NEXT:    addi s10, sp, 256
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s10), zero
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 3
+; RV32ZVBC32-NEXT:    mv s10, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s10, s10, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s10, s10, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s10, s10, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t1, t1, s10
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s8), zero
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 3
+; RV32ZVBC32-NEXT:    mv s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 2
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t1, t1, s8
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 4
+; RV32ZVBC32-NEXT:    mv s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t1, t1, s8
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t1) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 5
+; RV32ZVBC32-NEXT:    mv s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t1, t1, s8
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 3
+; RV32ZVBC32-NEXT:    mv s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 2
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t1, t1, s8
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 3
+; RV32ZVBC32-NEXT:    mv s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t1, t1, s8
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t1) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 4
+; RV32ZVBC32-NEXT:    mv s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t1, t1, s8
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 3
+; RV32ZVBC32-NEXT:    mv s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 2
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t1, t1, s8
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t1) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 3
+; RV32ZVBC32-NEXT:    mv s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add s8, s8, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t1, t1, s8
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s8, sp, 24
+; RV32ZVBC32-NEXT:    addi s10, sp, 240
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s10), zero
+; RV32ZVBC32-NEXT:    csrr s10, vlenb
+; RV32ZVBC32-NEXT:    slli s10, s10, 4
+; RV32ZVBC32-NEXT:    mv t1, s10
+; RV32ZVBC32-NEXT:    slli s10, s10, 2
+; RV32ZVBC32-NEXT:    add t1, t1, s10
+; RV32ZVBC32-NEXT:    slli s10, s10, 1
+; RV32ZVBC32-NEXT:    add s10, s10, t1
+; RV32ZVBC32-NEXT:    lw t1, 4(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    add s10, sp, s10
+; RV32ZVBC32-NEXT:    addi s10, s10, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s10) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (s9), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s7), zero
+; RV32ZVBC32-NEXT:    csrr s7, vlenb
+; RV32ZVBC32-NEXT:    slli s7, s7, 3
+; RV32ZVBC32-NEXT:    mv s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s9, s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 2
+; RV32ZVBC32-NEXT:    add s9, s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s7, s7, s9
+; RV32ZVBC32-NEXT:    add s7, sp, s7
+; RV32ZVBC32-NEXT:    addi s7, s7, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s7) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (ra), zero
+; RV32ZVBC32-NEXT:    csrr s7, vlenb
+; RV32ZVBC32-NEXT:    slli s7, s7, 3
+; RV32ZVBC32-NEXT:    mv s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 3
+; RV32ZVBC32-NEXT:    add s9, s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s7, s7, s9
+; RV32ZVBC32-NEXT:    add s7, sp, s7
+; RV32ZVBC32-NEXT:    addi s7, s7, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s7) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr s7, vlenb
+; RV32ZVBC32-NEXT:    slli s7, s7, 4
+; RV32ZVBC32-NEXT:    mv s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 2
+; RV32ZVBC32-NEXT:    add s9, s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s7, s7, s9
+; RV32ZVBC32-NEXT:    add s7, sp, s7
+; RV32ZVBC32-NEXT:    addi s7, s7, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (s7) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr s7, vlenb
+; RV32ZVBC32-NEXT:    slli s7, s7, 3
+; RV32ZVBC32-NEXT:    mv s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s9, s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s7, s7, s9
+; RV32ZVBC32-NEXT:    add s7, sp, s7
+; RV32ZVBC32-NEXT:    addi s7, s7, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s7) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr s7, vlenb
+; RV32ZVBC32-NEXT:    slli s7, s7, 4
+; RV32ZVBC32-NEXT:    mv s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s9, s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s7, s7, s9
+; RV32ZVBC32-NEXT:    add s7, sp, s7
+; RV32ZVBC32-NEXT:    addi s7, s7, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s7) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr s7, vlenb
+; RV32ZVBC32-NEXT:    slli s7, s7, 3
+; RV32ZVBC32-NEXT:    mv s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s9, s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 2
+; RV32ZVBC32-NEXT:    add s9, s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s7, s7, s9
+; RV32ZVBC32-NEXT:    add s7, sp, s7
+; RV32ZVBC32-NEXT:    addi s7, s7, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (s7) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr s7, vlenb
+; RV32ZVBC32-NEXT:    slli s7, s7, 3
+; RV32ZVBC32-NEXT:    mv s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 2
+; RV32ZVBC32-NEXT:    add s9, s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 2
+; RV32ZVBC32-NEXT:    add s7, s7, s9
+; RV32ZVBC32-NEXT:    add s7, sp, s7
+; RV32ZVBC32-NEXT:    addi s7, s7, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s7) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr s7, vlenb
+; RV32ZVBC32-NEXT:    slli s7, s7, 3
+; RV32ZVBC32-NEXT:    mv s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 3
+; RV32ZVBC32-NEXT:    add s9, s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s7, s7, s9
+; RV32ZVBC32-NEXT:    add s7, sp, s7
+; RV32ZVBC32-NEXT:    addi s7, s7, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (s7) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr s7, vlenb
+; RV32ZVBC32-NEXT:    slli s7, s7, 3
+; RV32ZVBC32-NEXT:    mv s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s9, s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 2
+; RV32ZVBC32-NEXT:    add s9, s9, s7
+; RV32ZVBC32-NEXT:    slli s7, s7, 1
+; RV32ZVBC32-NEXT:    add s7, s7, s9
+; RV32ZVBC32-NEXT:    add s7, sp, s7
+; RV32ZVBC32-NEXT:    addi s7, s7, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s7) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (s6), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s5), zero
+; RV32ZVBC32-NEXT:    csrr s5, vlenb
+; RV32ZVBC32-NEXT:    slli s5, s5, 3
+; RV32ZVBC32-NEXT:    mv s6, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 3
+; RV32ZVBC32-NEXT:    add s6, s6, s5
+; RV32ZVBC32-NEXT:    slli s5, s5, 1
+; RV32ZVBC32-NEXT:    add s5, s5, s6
+; RV32ZVBC32-NEXT:    add s5, sp, s5
+; RV32ZVBC32-NEXT:    addi s5, s5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s4), zero
+; RV32ZVBC32-NEXT:    csrr s4, vlenb
+; RV32ZVBC32-NEXT:    slli s4, s4, 4
+; RV32ZVBC32-NEXT:    mv s5, s4
+; RV32ZVBC32-NEXT:    slli s4, s4, 2
+; RV32ZVBC32-NEXT:    add s5, s5, s4
+; RV32ZVBC32-NEXT:    slli s4, s4, 1
+; RV32ZVBC32-NEXT:    add s4, s4, s5
+; RV32ZVBC32-NEXT:    add s4, sp, s4
+; RV32ZVBC32-NEXT:    addi s4, s4, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s4) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s2), zero
+; RV32ZVBC32-NEXT:    csrr s2, vlenb
+; RV32ZVBC32-NEXT:    slli s2, s2, 6
+; RV32ZVBC32-NEXT:    mv s4, s2
+; RV32ZVBC32-NEXT:    slli s2, s2, 1
+; RV32ZVBC32-NEXT:    add s2, s2, s4
+; RV32ZVBC32-NEXT:    add s2, sp, s2
+; RV32ZVBC32-NEXT:    addi s2, s2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    csrr s2, vlenb
+; RV32ZVBC32-NEXT:    slli s2, s2, 4
+; RV32ZVBC32-NEXT:    mv s4, s2
+; RV32ZVBC32-NEXT:    slli s2, s2, 1
+; RV32ZVBC32-NEXT:    add s2, s2, s4
+; RV32ZVBC32-NEXT:    add s2, sp, s2
+; RV32ZVBC32-NEXT:    addi s2, s2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v0, (s2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr s2, vlenb
+; RV32ZVBC32-NEXT:    slli s2, s2, 3
+; RV32ZVBC32-NEXT:    mv s4, s2
+; RV32ZVBC32-NEXT:    slli s2, s2, 3
+; RV32ZVBC32-NEXT:    add s4, s4, s2
+; RV32ZVBC32-NEXT:    slli s2, s2, 1
+; RV32ZVBC32-NEXT:    add s2, s2, s4
+; RV32ZVBC32-NEXT:    add s2, sp, s2
+; RV32ZVBC32-NEXT:    addi s2, s2, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (s2) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr s2, vlenb
+; RV32ZVBC32-NEXT:    slli s2, s2, 3
+; RV32ZVBC32-NEXT:    mv s4, s2
+; RV32ZVBC32-NEXT:    slli s2, s2, 2
+; RV32ZVBC32-NEXT:    add s4, s4, s2
+; RV32ZVBC32-NEXT:    slli s2, s2, 1
+; RV32ZVBC32-NEXT:    add s2, s2, s4
+; RV32ZVBC32-NEXT:    add s2, sp, s2
+; RV32ZVBC32-NEXT:    addi s2, s2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr s2, vlenb
+; RV32ZVBC32-NEXT:    slli s2, s2, 4
+; RV32ZVBC32-NEXT:    mv s4, s2
+; RV32ZVBC32-NEXT:    slli s2, s2, 2
+; RV32ZVBC32-NEXT:    add s4, s4, s2
+; RV32ZVBC32-NEXT:    slli s2, s2, 1
+; RV32ZVBC32-NEXT:    add s2, s2, s4
+; RV32ZVBC32-NEXT:    add s2, sp, s2
+; RV32ZVBC32-NEXT:    addi s2, s2, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (s2) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr s2, vlenb
+; RV32ZVBC32-NEXT:    slli s2, s2, 5
+; RV32ZVBC32-NEXT:    mv s4, s2
+; RV32ZVBC32-NEXT:    slli s2, s2, 2
+; RV32ZVBC32-NEXT:    add s2, s2, s4
+; RV32ZVBC32-NEXT:    add s2, sp, s2
+; RV32ZVBC32-NEXT:    addi s2, s2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr s2, vlenb
+; RV32ZVBC32-NEXT:    slli s2, s2, 6
+; RV32ZVBC32-NEXT:    mv s4, s2
+; RV32ZVBC32-NEXT:    slli s2, s2, 1
+; RV32ZVBC32-NEXT:    add s2, s2, s4
+; RV32ZVBC32-NEXT:    add s2, sp, s2
+; RV32ZVBC32-NEXT:    addi s2, s2, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (s2) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr s2, vlenb
+; RV32ZVBC32-NEXT:    slli s2, s2, 4
+; RV32ZVBC32-NEXT:    mv s4, s2
+; RV32ZVBC32-NEXT:    slli s2, s2, 2
+; RV32ZVBC32-NEXT:    add s4, s4, s2
+; RV32ZVBC32-NEXT:    slli s2, s2, 1
+; RV32ZVBC32-NEXT:    add s2, s2, s4
+; RV32ZVBC32-NEXT:    add s2, sp, s2
+; RV32ZVBC32-NEXT:    addi s2, s2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s3), zero
+; RV32ZVBC32-NEXT:    csrr s2, vlenb
+; RV32ZVBC32-NEXT:    slli s2, s2, 6
+; RV32ZVBC32-NEXT:    mv s3, s2
+; RV32ZVBC32-NEXT:    slli s2, s2, 1
+; RV32ZVBC32-NEXT:    add s2, s2, s3
+; RV32ZVBC32-NEXT:    add s2, sp, s2
+; RV32ZVBC32-NEXT:    addi s2, s2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (s1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s0), zero
+; RV32ZVBC32-NEXT:    csrr s0, vlenb
+; RV32ZVBC32-NEXT:    slli s0, s0, 3
+; RV32ZVBC32-NEXT:    mv s1, s0
+; RV32ZVBC32-NEXT:    slli s0, s0, 3
+; RV32ZVBC32-NEXT:    add s1, s1, s0
+; RV32ZVBC32-NEXT:    slli s0, s0, 1
+; RV32ZVBC32-NEXT:    add s0, s0, s1
+; RV32ZVBC32-NEXT:    add s0, sp, s0
+; RV32ZVBC32-NEXT:    addi s0, s0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (s0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t5), zero
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 3
+; RV32ZVBC32-NEXT:    mv s0, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add s0, s0, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add s0, s0, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 2
+; RV32ZVBC32-NEXT:    add t5, t5, s0
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 6
+; RV32ZVBC32-NEXT:    mv s0, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add t5, t5, s0
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t5) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 3
+; RV32ZVBC32-NEXT:    mv s0, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 2
+; RV32ZVBC32-NEXT:    add t5, t5, s0
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 5
+; RV32ZVBC32-NEXT:    mv s0, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add t5, t5, s0
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 3
+; RV32ZVBC32-NEXT:    mv s0, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 3
+; RV32ZVBC32-NEXT:    add s0, s0, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add t5, t5, s0
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t5) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 3
+; RV32ZVBC32-NEXT:    mv s0, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add s0, s0, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 3
+; RV32ZVBC32-NEXT:    add t5, t5, s0
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 3
+; RV32ZVBC32-NEXT:    mv s0, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add s0, s0, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add s0, s0, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 2
+; RV32ZVBC32-NEXT:    add t5, t5, s0
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t5) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 3
+; RV32ZVBC32-NEXT:    mv s0, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 3
+; RV32ZVBC32-NEXT:    add s0, s0, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add t5, t5, s0
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t6), zero
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 3
+; RV32ZVBC32-NEXT:    mv t6, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add t6, t6, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 1
+; RV32ZVBC32-NEXT:    add t6, t6, t5
+; RV32ZVBC32-NEXT:    slli t5, t5, 2
+; RV32ZVBC32-NEXT:    add t5, t5, t6
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (t4), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t3), zero
+; RV32ZVBC32-NEXT:    csrr t3, vlenb
+; RV32ZVBC32-NEXT:    slli t3, t3, 6
+; RV32ZVBC32-NEXT:    mv t4, t3
+; RV32ZVBC32-NEXT:    slli t3, t3, 1
+; RV32ZVBC32-NEXT:    add t3, t3, t4
+; RV32ZVBC32-NEXT:    add t3, sp, t3
+; RV32ZVBC32-NEXT:    addi t3, t3, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t3) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t1), zero
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 4
+; RV32ZVBC32-NEXT:    mv t3, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t3, t3, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 2
+; RV32ZVBC32-NEXT:    add t1, t1, t3
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 3
+; RV32ZVBC32-NEXT:    mv t3, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t3, t3, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t3, t3, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 2
+; RV32ZVBC32-NEXT:    add t1, t1, t3
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t1) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 5
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 3
+; RV32ZVBC32-NEXT:    mv t3, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t3, t3, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 2
+; RV32ZVBC32-NEXT:    add t1, t1, t3
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 6
+; RV32ZVBC32-NEXT:    mv t3, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t1, t1, t3
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t1) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 4
+; RV32ZVBC32-NEXT:    mv t3, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 3
+; RV32ZVBC32-NEXT:    add t1, t1, t3
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 4
+; RV32ZVBC32-NEXT:    mv t3, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t3, t3, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 2
+; RV32ZVBC32-NEXT:    add t1, t1, t3
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (t1) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 6
+; RV32ZVBC32-NEXT:    mv t3, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t1, t1, t3
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t2), zero
+; RV32ZVBC32-NEXT:    csrr t1, vlenb
+; RV32ZVBC32-NEXT:    slli t1, t1, 4
+; RV32ZVBC32-NEXT:    mv t2, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 1
+; RV32ZVBC32-NEXT:    add t2, t2, t1
+; RV32ZVBC32-NEXT:    slli t1, t1, 2
+; RV32ZVBC32-NEXT:    add t1, t1, t2
+; RV32ZVBC32-NEXT:    add t1, sp, t1
+; RV32ZVBC32-NEXT:    addi t1, t1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (t0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a7), zero
+; RV32ZVBC32-NEXT:    csrr a7, vlenb
+; RV32ZVBC32-NEXT:    slli a7, a7, 3
+; RV32ZVBC32-NEXT:    mv t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 1
+; RV32ZVBC32-NEXT:    add t0, t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 1
+; RV32ZVBC32-NEXT:    add t0, t0, a7
+; RV32ZVBC32-NEXT:    slli a7, a7, 2
+; RV32ZVBC32-NEXT:    add a7, a7, t0
+; RV32ZVBC32-NEXT:    add a7, sp, a7
+; RV32ZVBC32-NEXT:    addi a7, a7, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a6), zero
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 7
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 4
+; RV32ZVBC32-NEXT:    mv a7, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 1
+; RV32ZVBC32-NEXT:    add a7, a7, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 2
+; RV32ZVBC32-NEXT:    add a6, a6, a7
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 3
+; RV32ZVBC32-NEXT:    mv a7, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 1
+; RV32ZVBC32-NEXT:    add a6, a6, a7
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 4
+; RV32ZVBC32-NEXT:    mv a7, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 2
+; RV32ZVBC32-NEXT:    add a6, a6, a7
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 3
+; RV32ZVBC32-NEXT:    mv a7, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 1
+; RV32ZVBC32-NEXT:    add a7, a7, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 1
+; RV32ZVBC32-NEXT:    add a7, a7, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 2
+; RV32ZVBC32-NEXT:    add a6, a6, a7
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 3
+; RV32ZVBC32-NEXT:    mv a7, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 4
+; RV32ZVBC32-NEXT:    add a6, a6, a7
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 7
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a6, vlenb
+; RV32ZVBC32-NEXT:    slli a6, a6, 3
+; RV32ZVBC32-NEXT:    mv a7, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 1
+; RV32ZVBC32-NEXT:    add a7, a7, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 1
+; RV32ZVBC32-NEXT:    add a7, a7, a6
+; RV32ZVBC32-NEXT:    slli a6, a6, 2
+; RV32ZVBC32-NEXT:    add a6, a6, a7
+; RV32ZVBC32-NEXT:    add a6, sp, a6
+; RV32ZVBC32-NEXT:    addi a6, a6, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a5), zero
+; RV32ZVBC32-NEXT:    csrr a5, vlenb
+; RV32ZVBC32-NEXT:    slli a5, a5, 7
+; RV32ZVBC32-NEXT:    add a5, sp, a5
+; RV32ZVBC32-NEXT:    addi a5, a5, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a4), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a2), zero
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a2, 4
+; RV32ZVBC32-NEXT:    mv a4, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 2
+; RV32ZVBC32-NEXT:    add a2, a2, a4
+; RV32ZVBC32-NEXT:    add a2, sp, a2
+; RV32ZVBC32-NEXT:    addi a2, a2, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a2, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a2, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a2, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a2, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a2, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s11), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a3), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s8), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v0, v16, v24
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v24, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 352
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv8i64_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vclmul.vx v8, v8, a0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmul_nxv8i64_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    andi a1, a0, 2
+; RV64ZVBC32-NEXT:    andi a2, a0, 1
+; RV64ZVBC32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 4
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 8
+; RV64ZVBC32-NEXT:    vxor.vv v16, v24, v16
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 16
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 32
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 64
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 128
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 256
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 512
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a1
+; RV64ZVBC32-NEXT:    andi a3, a0, 1024
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    li a1, 1
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a3
+; RV64ZVBC32-NEXT:    slli a2, a1, 11
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 64
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 128
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 256
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 512
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1024
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2048
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4096
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8192
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16384
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32768
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 65536
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 131072
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 262144
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    srliw a2, a0, 31
+; RV64ZVBC32-NEXT:    slli a2, a2, 31
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 32
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 33
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 34
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 35
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 36
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 37
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 38
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 39
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 40
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 41
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 42
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 43
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 44
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 45
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 46
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 47
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 48
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 49
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 50
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 51
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 52
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 53
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 54
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 55
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 56
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 57
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 58
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 59
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 60
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 61
+; RV64ZVBC32-NEXT:    slli a1, a1, 62
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    and a1, a0, a1
+; RV64ZVBC32-NEXT:    srli a0, a0, 63
+; RV64ZVBC32-NEXT:    slli a0, a0, 63
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, a1
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
   %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
   %v = call <vscale x 8 x i64> @llvm.clmul.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb)
@@ -15885,17 +29138,826 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscale
 ; RV64V-NEXT:    vxor.vv v8, v10, v9, v0.t
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv1i64_vv_mask:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
-; RV32ZVBC-NEXT:    vclmul.vv v8, v8, v9, v0.t
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv1i64_vv_mask:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v8, v9, v0.t
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv1i64_vv_mask:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
-; RV64ZVBC-NEXT:    vclmul.vv v8, v8, v9, v0.t
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmul_nxv1i64_vv_mask:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v8, v9, v0.t
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv1i64_vv_mask:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -352
+; RV32ZVBC32-NEXT:    .cfi_def_cfa_offset 352
+; RV32ZVBC32-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    .cfi_offset ra, -4
+; RV32ZVBC32-NEXT:    .cfi_offset s0, -8
+; RV32ZVBC32-NEXT:    .cfi_offset s1, -12
+; RV32ZVBC32-NEXT:    .cfi_offset s2, -16
+; RV32ZVBC32-NEXT:    .cfi_offset s3, -20
+; RV32ZVBC32-NEXT:    .cfi_offset s4, -24
+; RV32ZVBC32-NEXT:    .cfi_offset s5, -28
+; RV32ZVBC32-NEXT:    .cfi_offset s6, -32
+; RV32ZVBC32-NEXT:    .cfi_offset s7, -36
+; RV32ZVBC32-NEXT:    .cfi_offset s8, -40
+; RV32ZVBC32-NEXT:    .cfi_offset s9, -44
+; RV32ZVBC32-NEXT:    .cfi_offset s10, -48
+; RV32ZVBC32-NEXT:    .cfi_offset s11, -52
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a0, 3
+; RV32ZVBC32-NEXT:    sub a0, a1, a0
+; RV32ZVBC32-NEXT:    sub sp, sp, a0
+; RV32ZVBC32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xe0, 0x02, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 352 + 7 * vlenb
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a1, 524288
+; RV32ZVBC32-NEXT:    li t5, 1
+; RV32ZVBC32-NEXT:    li a4, 2
+; RV32ZVBC32-NEXT:    li a2, 4
+; RV32ZVBC32-NEXT:    li ra, 8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    li s11, 32
+; RV32ZVBC32-NEXT:    li s8, 64
+; RV32ZVBC32-NEXT:    li s9, 128
+; RV32ZVBC32-NEXT:    li s10, 256
+; RV32ZVBC32-NEXT:    li s6, 512
+; RV32ZVBC32-NEXT:    li s1, 1024
+; RV32ZVBC32-NEXT:    lui s7, 1
+; RV32ZVBC32-NEXT:    lui s5, 2
+; RV32ZVBC32-NEXT:    lui s4, 4
+; RV32ZVBC32-NEXT:    lui s3, 8
+; RV32ZVBC32-NEXT:    lui s2, 16
+; RV32ZVBC32-NEXT:    lui s0, 32
+; RV32ZVBC32-NEXT:    lui t6, 64
+; RV32ZVBC32-NEXT:    lui t4, 128
+; RV32ZVBC32-NEXT:    lui t3, 256
+; RV32ZVBC32-NEXT:    lui t2, 512
+; RV32ZVBC32-NEXT:    lui t1, 1024
+; RV32ZVBC32-NEXT:    lui t0, 2048
+; RV32ZVBC32-NEXT:    lui a7, 4096
+; RV32ZVBC32-NEXT:    lui a6, 8192
+; RV32ZVBC32-NEXT:    lui a5, 16384
+; RV32ZVBC32-NEXT:    lui a3, 32768
+; RV32ZVBC32-NEXT:    sw a1, 16(sp)
+; RV32ZVBC32-NEXT:    sw zero, 20(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw t5, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw a4, 268(sp)
+; RV32ZVBC32-NEXT:    lui a4, 65536
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw a2, 260(sp)
+; RV32ZVBC32-NEXT:    lui a2, 131072
+; RV32ZVBC32-NEXT:    sw zero, 248(sp)
+; RV32ZVBC32-NEXT:    sw ra, 252(sp)
+; RV32ZVBC32-NEXT:    vsetvli ra, zero, e64, m1, ta, mu
+; RV32ZVBC32-NEXT:    vand.vi v13, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v14, v9, 1
+; RV32ZVBC32-NEXT:    vand.vi v12, v9, 4
+; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    sw a0, 244(sp)
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, a0
+; RV32ZVBC32-NEXT:    addi ra, sp, 16
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    sw s11, 236(sp)
+; RV32ZVBC32-NEXT:    vand.vx v15, v9, s11
+; RV32ZVBC32-NEXT:    addi s11, sp, 272
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw s8, 228(sp)
+; RV32ZVBC32-NEXT:    vand.vx v16, v9, s8
+; RV32ZVBC32-NEXT:    addi s8, sp, 264
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s9, 220(sp)
+; RV32ZVBC32-NEXT:    vand.vx v17, v9, s9
+; RV32ZVBC32-NEXT:    addi s9, sp, 256
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s10, 212(sp)
+; RV32ZVBC32-NEXT:    vand.vx v18, v9, s10
+; RV32ZVBC32-NEXT:    addi s10, sp, 248
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s6, 204(sp)
+; RV32ZVBC32-NEXT:    vand.vx v19, v9, s6
+; RV32ZVBC32-NEXT:    addi s6, sp, 240
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw s1, 196(sp)
+; RV32ZVBC32-NEXT:    vand.vx v20, v9, s1
+; RV32ZVBC32-NEXT:    slli t5, t5, 11
+; RV32ZVBC32-NEXT:    vand.vx v21, v9, s7
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw t5, 188(sp)
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw s7, 180(sp)
+; RV32ZVBC32-NEXT:    vand.vx v22, v9, s5
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw s5, 172(sp)
+; RV32ZVBC32-NEXT:    addi s7, sp, 216
+; RV32ZVBC32-NEXT:    vand.vx v23, v9, s4
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw s4, 164(sp)
+; RV32ZVBC32-NEXT:    addi s5, sp, 208
+; RV32ZVBC32-NEXT:    vand.vx v24, v9, s3
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw s3, 156(sp)
+; RV32ZVBC32-NEXT:    addi s4, sp, 200
+; RV32ZVBC32-NEXT:    vand.vx v25, v9, s2
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw s2, 148(sp)
+; RV32ZVBC32-NEXT:    addi s3, sp, 192
+; RV32ZVBC32-NEXT:    vand.vx v26, v9, s0
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw s0, 140(sp)
+; RV32ZVBC32-NEXT:    addi s2, sp, 184
+; RV32ZVBC32-NEXT:    vand.vx v27, v9, t6
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw t6, 132(sp)
+; RV32ZVBC32-NEXT:    addi s1, sp, 176
+; RV32ZVBC32-NEXT:    vand.vx v28, v9, t4
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw t4, 124(sp)
+; RV32ZVBC32-NEXT:    addi s0, sp, 168
+; RV32ZVBC32-NEXT:    vand.vx v29, v9, t3
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw t3, 116(sp)
+; RV32ZVBC32-NEXT:    addi t6, sp, 160
+; RV32ZVBC32-NEXT:    vand.vx v30, v9, t2
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw t2, 108(sp)
+; RV32ZVBC32-NEXT:    addi t4, sp, 152
+; RV32ZVBC32-NEXT:    vand.vx v31, v9, t1
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw t1, 100(sp)
+; RV32ZVBC32-NEXT:    addi t3, sp, 144
+; RV32ZVBC32-NEXT:    vand.vx v7, v9, t0
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw t0, 92(sp)
+; RV32ZVBC32-NEXT:    addi t2, sp, 136
+; RV32ZVBC32-NEXT:    vand.vx v6, v9, a7
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw a7, 84(sp)
+; RV32ZVBC32-NEXT:    addi t1, sp, 128
+; RV32ZVBC32-NEXT:    vand.vx v5, v9, a6
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw a6, 76(sp)
+; RV32ZVBC32-NEXT:    addi t0, sp, 120
+; RV32ZVBC32-NEXT:    vand.vx v4, v9, a5
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw a5, 68(sp)
+; RV32ZVBC32-NEXT:    addi a7, sp, 112
+; RV32ZVBC32-NEXT:    vand.vx v3, v9, a3
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw a3, 60(sp)
+; RV32ZVBC32-NEXT:    addi a6, sp, 104
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, a4
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw a4, 52(sp)
+; RV32ZVBC32-NEXT:    addi a5, sp, 96
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a2
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw a2, 44(sp)
+; RV32ZVBC32-NEXT:    addi a4, sp, 88
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    lui a1, 262144
+; RV32ZVBC32-NEXT:    sw a1, 36(sp)
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    sw a0, 28(sp)
+; RV32ZVBC32-NEXT:    addi a3, sp, 80
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v13
+; RV32ZVBC32-NEXT:    vmul.vv v13, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v13, v13, v10
+; RV32ZVBC32-NEXT:    vand.vx v10, v9, t5
+; RV32ZVBC32-NEXT:    addi a2, sp, 72
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v13, v13, v12
+; RV32ZVBC32-NEXT:    vlse64.v v12, (ra), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli t5, a0, 2
+; RV32ZVBC32-NEXT:    add a0, t5, a0
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v13, v13, v11
+; RV32ZVBC32-NEXT:    vlse64.v v11, (s11), zero
+; RV32ZVBC32-NEXT:    csrr t5, vlenb
+; RV32ZVBC32-NEXT:    slli t5, t5, 2
+; RV32ZVBC32-NEXT:    add t5, sp, t5
+; RV32ZVBC32-NEXT:    addi t5, t5, 288
+; RV32ZVBC32-NEXT:    vs1r.v v11, (t5) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s11, sp, 56
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v13, v13, v14
+; RV32ZVBC32-NEXT:    vlse64.v v0, (s8), zero
+; RV32ZVBC32-NEXT:    addi ra, sp, 48
+; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v15
+; RV32ZVBC32-NEXT:    vxor.vv v14, v13, v14
+; RV32ZVBC32-NEXT:    vlse64.v v12, (s9), zero
+; RV32ZVBC32-NEXT:    addi t5, sp, 40
+; RV32ZVBC32-NEXT:    vmul.vv v15, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v15, v14, v15
+; RV32ZVBC32-NEXT:    vlse64.v v11, (s10), zero
+; RV32ZVBC32-NEXT:    csrr s8, vlenb
+; RV32ZVBC32-NEXT:    slli s9, s8, 1
+; RV32ZVBC32-NEXT:    add s8, s9, s8
+; RV32ZVBC32-NEXT:    add s8, sp, s8
+; RV32ZVBC32-NEXT:    addi s8, s8, 288
+; RV32ZVBC32-NEXT:    vs1r.v v11, (s8) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s8, sp, 32
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v17
+; RV32ZVBC32-NEXT:    vxor.vv v16, v15, v16
+; RV32ZVBC32-NEXT:    vlse64.v v11, (s6), zero
+; RV32ZVBC32-NEXT:    csrr s6, vlenb
+; RV32ZVBC32-NEXT:    slli s6, s6, 1
+; RV32ZVBC32-NEXT:    add s6, sp, s6
+; RV32ZVBC32-NEXT:    addi s6, s6, 288
+; RV32ZVBC32-NEXT:    vs1r.v v11, (s6) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi s6, sp, 24
+; RV32ZVBC32-NEXT:    vmul.vv v17, v8, v18
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v19
+; RV32ZVBC32-NEXT:    vmul.vv v19, v8, v20
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v21
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v22
+; RV32ZVBC32-NEXT:    vmul.vv v22, v8, v23
+; RV32ZVBC32-NEXT:    vmul.vv v23, v8, v24
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v25
+; RV32ZVBC32-NEXT:    vmul.vv v25, v8, v26
+; RV32ZVBC32-NEXT:    vmul.vv v26, v8, v27
+; RV32ZVBC32-NEXT:    vmul.vv v27, v8, v28
+; RV32ZVBC32-NEXT:    vmul.vv v28, v8, v29
+; RV32ZVBC32-NEXT:    vmul.vv v29, v8, v30
+; RV32ZVBC32-NEXT:    vmul.vv v30, v8, v31
+; RV32ZVBC32-NEXT:    vmul.vv v31, v8, v7
+; RV32ZVBC32-NEXT:    vmul.vv v7, v8, v6
+; RV32ZVBC32-NEXT:    vmul.vv v6, v8, v5
+; RV32ZVBC32-NEXT:    vmul.vv v5, v8, v4
+; RV32ZVBC32-NEXT:    vmul.vv v4, v8, v3
+; RV32ZVBC32-NEXT:    vmul.vv v3, v8, v2
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v17
+; RV32ZVBC32-NEXT:    addi s9, sp, 232
+; RV32ZVBC32-NEXT:    vlse64.v v11, (s9), zero
+; RV32ZVBC32-NEXT:    csrr s9, vlenb
+; RV32ZVBC32-NEXT:    add s9, sp, s9
+; RV32ZVBC32-NEXT:    addi s9, s9, 288
+; RV32ZVBC32-NEXT:    vs1r.v v11, (s9) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    addi s9, sp, 224
+; RV32ZVBC32-NEXT:    vlse64.v v11, (s9), zero
+; RV32ZVBC32-NEXT:    addi s9, sp, 288
+; RV32ZVBC32-NEXT:    vs1r.v v11, (s9) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v19
+; RV32ZVBC32-NEXT:    vlse64.v v19, (s7), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v16, v10
+; RV32ZVBC32-NEXT:    vlse64.v v13, (s5), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v20
+; RV32ZVBC32-NEXT:    vlse64.v v20, (s4), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v21
+; RV32ZVBC32-NEXT:    vlse64.v v21, (s3), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v22
+; RV32ZVBC32-NEXT:    vlse64.v v22, (s2), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v23
+; RV32ZVBC32-NEXT:    vlse64.v v23, (s1), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v24
+; RV32ZVBC32-NEXT:    vlse64.v v24, (s0), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v25
+; RV32ZVBC32-NEXT:    vlse64.v v25, (t6), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v26
+; RV32ZVBC32-NEXT:    vlse64.v v26, (t4), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v27
+; RV32ZVBC32-NEXT:    vlse64.v v27, (t3), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v28
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t2), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v29
+; RV32ZVBC32-NEXT:    vlse64.v v29, (t1), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v30
+; RV32ZVBC32-NEXT:    vlse64.v v30, (t0), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v31
+; RV32ZVBC32-NEXT:    vlse64.v v31, (a7), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v7
+; RV32ZVBC32-NEXT:    vlse64.v v7, (a6), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v6
+; RV32ZVBC32-NEXT:    vlse64.v v6, (a5), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v5
+; RV32ZVBC32-NEXT:    vlse64.v v5, (a4), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v4
+; RV32ZVBC32-NEXT:    vlse64.v v4, (a3), zero
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v3
+; RV32ZVBC32-NEXT:    vlse64.v v3, (a2), zero
+; RV32ZVBC32-NEXT:    vxor.vv v2, v10, v2
+; RV32ZVBC32-NEXT:    vlse64.v v11, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a2, a0
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v1, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v14, v9, v10
+; RV32ZVBC32-NEXT:    vand.vv v0, v9, v0
+; RV32ZVBC32-NEXT:    vand.vv v15, v9, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a2, a0
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v16, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v17, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v18, v9, v10
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v19, v9, v19
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v13
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v20, v9, v20
+; RV32ZVBC32-NEXT:    vand.vv v21, v9, v21
+; RV32ZVBC32-NEXT:    vand.vv v22, v9, v22
+; RV32ZVBC32-NEXT:    vand.vv v23, v9, v23
+; RV32ZVBC32-NEXT:    vand.vv v24, v9, v24
+; RV32ZVBC32-NEXT:    vand.vv v25, v9, v25
+; RV32ZVBC32-NEXT:    vand.vv v26, v9, v26
+; RV32ZVBC32-NEXT:    vand.vv v27, v9, v27
+; RV32ZVBC32-NEXT:    vand.vv v28, v9, v28
+; RV32ZVBC32-NEXT:    vand.vv v29, v9, v29
+; RV32ZVBC32-NEXT:    vand.vv v30, v9, v30
+; RV32ZVBC32-NEXT:    vand.vv v31, v9, v31
+; RV32ZVBC32-NEXT:    vand.vv v7, v9, v7
+; RV32ZVBC32-NEXT:    vand.vv v6, v9, v6
+; RV32ZVBC32-NEXT:    vand.vv v5, v9, v5
+; RV32ZVBC32-NEXT:    vlse64.v v10, (s11), zero
+; RV32ZVBC32-NEXT:    vand.vv v4, v9, v4
+; RV32ZVBC32-NEXT:    vand.vv v3, v9, v3
+; RV32ZVBC32-NEXT:    vand.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a2, a0
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v10, (ra), zero
+; RV32ZVBC32-NEXT:    vlse64.v v11, (t5), zero
+; RV32ZVBC32-NEXT:    vlse64.v v12, (s8), zero
+; RV32ZVBC32-NEXT:    vlse64.v v13, (s6), zero
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v10
+; RV32ZVBC32-NEXT:    vand.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vand.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a2, a0
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v13, v9, v13
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a1
+; RV32ZVBC32-NEXT:    vmul.vv v9, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v9, v2, v9
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v15
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v17
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v19
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v22
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v23
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v25
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v26
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v27
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v29
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v30
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v31
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v7
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v6
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v5
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v3
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a1, a0
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a1, a0
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v13
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v10, v0.t
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a0, 3
+; RV32ZVBC32-NEXT:    sub a0, a1, a0
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    .cfi_def_cfa sp, 352
+; RV32ZVBC32-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    .cfi_restore ra
+; RV32ZVBC32-NEXT:    .cfi_restore s0
+; RV32ZVBC32-NEXT:    .cfi_restore s1
+; RV32ZVBC32-NEXT:    .cfi_restore s2
+; RV32ZVBC32-NEXT:    .cfi_restore s3
+; RV32ZVBC32-NEXT:    .cfi_restore s4
+; RV32ZVBC32-NEXT:    .cfi_restore s5
+; RV32ZVBC32-NEXT:    .cfi_restore s6
+; RV32ZVBC32-NEXT:    .cfi_restore s7
+; RV32ZVBC32-NEXT:    .cfi_restore s8
+; RV32ZVBC32-NEXT:    .cfi_restore s9
+; RV32ZVBC32-NEXT:    .cfi_restore s10
+; RV32ZVBC32-NEXT:    .cfi_restore s11
+; RV32ZVBC32-NEXT:    addi sp, sp, 352
+; RV32ZVBC32-NEXT:    .cfi_def_cfa_offset 0
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv1i64_vv_mask:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
+; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    li a1, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    li a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    li a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    li a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a2
+; RV64ZVBC32-NEXT:    slli a1, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    lui a1, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 31
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 33
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 34
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 35
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 36
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 37
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 38
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 39
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 40
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 41
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 42
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 43
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 44
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 45
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 46
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 47
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 48
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 49
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 50
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 51
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 52
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 53
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 54
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 55
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 56
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 57
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 58
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 59
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 60
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    slli a1, a0, 61
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a1
+; RV64ZVBC32-NEXT:    li a1, -1
+; RV64ZVBC32-NEXT:    slli a0, a0, 62
+; RV64ZVBC32-NEXT:    slli a1, a1, 63
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v9, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v9, v0.t
+; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 1 x i64> @llvm.clmul.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb)
   %w = select <vscale x 1 x i1> %mask, <vscale x 1 x i64> %v, <vscale x 1 x i64> %va
   ret <vscale x 1 x i64> %w
@@ -16673,28 +30735,802 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vx_mask(<vscale x 1 x i64> %va, i64 %b,
 ; RV64V-NEXT:    vxor.vv v8, v9, v10, v0.t
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmul_nxv1i64_vx_mask:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    addi sp, sp, -16
-; RV32ZVBC-NEXT:    .cfi_def_cfa_offset 16
-; RV32ZVBC-NEXT:    sw a0, 8(sp)
-; RV32ZVBC-NEXT:    sw a1, 12(sp)
-; RV32ZVBC-NEXT:    addi a0, sp, 8
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
-; RV32ZVBC-NEXT:    vlse64.v v9, (a0), zero
-; RV32ZVBC-NEXT:    vclmul.vv v8, v8, v9, v0.t
-; RV32ZVBC-NEXT:    addi sp, sp, 16
-; RV32ZVBC-NEXT:    .cfi_def_cfa_offset 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmul_nxv1i64_vx_mask:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    addi sp, sp, -16
+; RV32ZVBC64-NEXT:    .cfi_def_cfa_offset 16
+; RV32ZVBC64-NEXT:    sw a0, 8(sp)
+; RV32ZVBC64-NEXT:    sw a1, 12(sp)
+; RV32ZVBC64-NEXT:    addi a0, sp, 8
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
+; RV32ZVBC64-NEXT:    vlse64.v v9, (a0), zero
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v8, v9, v0.t
+; RV32ZVBC64-NEXT:    addi sp, sp, 16
+; RV32ZVBC64-NEXT:    .cfi_def_cfa_offset 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv1i64_vx_mask:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
+; RV64ZVBC64-NEXT:    vclmul.vx v8, v8, a0, v0.t
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv1i64_vx_mask:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -352
+; RV32ZVBC32-NEXT:    .cfi_def_cfa_offset 352
+; RV32ZVBC32-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    .cfi_offset ra, -4
+; RV32ZVBC32-NEXT:    .cfi_offset s0, -8
+; RV32ZVBC32-NEXT:    .cfi_offset s1, -12
+; RV32ZVBC32-NEXT:    .cfi_offset s2, -16
+; RV32ZVBC32-NEXT:    .cfi_offset s3, -20
+; RV32ZVBC32-NEXT:    .cfi_offset s4, -24
+; RV32ZVBC32-NEXT:    .cfi_offset s5, -28
+; RV32ZVBC32-NEXT:    .cfi_offset s6, -32
+; RV32ZVBC32-NEXT:    .cfi_offset s7, -36
+; RV32ZVBC32-NEXT:    .cfi_offset s8, -40
+; RV32ZVBC32-NEXT:    .cfi_offset s9, -44
+; RV32ZVBC32-NEXT:    .cfi_offset s10, -48
+; RV32ZVBC32-NEXT:    .cfi_offset s11, -52
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    mv a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a3
+; RV32ZVBC32-NEXT:    sub sp, sp, a2
+; RV32ZVBC32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xe0, 0x02, 0x22, 0x11, 0x06, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 352 + 6 * vlenb
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a3, a2, 2
+; RV32ZVBC32-NEXT:    add a2, a3, a2
+; RV32ZVBC32-NEXT:    add a2, sp, a2
+; RV32ZVBC32-NEXT:    addi a2, a2, 288
+; RV32ZVBC32-NEXT:    vs1r.v v0, (a2) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw a0, 8(sp)
+; RV32ZVBC32-NEXT:    sw a1, 12(sp)
+; RV32ZVBC32-NEXT:    addi s3, sp, 8
+; RV32ZVBC32-NEXT:    lui s2, 524288
+; RV32ZVBC32-NEXT:    li s11, 1
+; RV32ZVBC32-NEXT:    li s5, 2
+; RV32ZVBC32-NEXT:    li s7, 4
+; RV32ZVBC32-NEXT:    li s10, 8
+; RV32ZVBC32-NEXT:    li ra, 64
+; RV32ZVBC32-NEXT:    li s9, 128
+; RV32ZVBC32-NEXT:    li s8, 256
+; RV32ZVBC32-NEXT:    li s6, 512
+; RV32ZVBC32-NEXT:    li s4, 1024
+; RV32ZVBC32-NEXT:    lui s1, 1
+; RV32ZVBC32-NEXT:    lui s0, 2
+; RV32ZVBC32-NEXT:    lui t6, 4
+; RV32ZVBC32-NEXT:    lui t5, 8
+; RV32ZVBC32-NEXT:    lui t4, 16
+; RV32ZVBC32-NEXT:    lui t3, 32
+; RV32ZVBC32-NEXT:    lui t2, 64
+; RV32ZVBC32-NEXT:    lui t1, 128
+; RV32ZVBC32-NEXT:    lui t0, 256
+; RV32ZVBC32-NEXT:    lui a6, 512
+; RV32ZVBC32-NEXT:    lui a5, 1024
+; RV32ZVBC32-NEXT:    lui a4, 2048
+; RV32ZVBC32-NEXT:    lui a3, 4096
+; RV32ZVBC32-NEXT:    lui a2, 8192
+; RV32ZVBC32-NEXT:    lui a0, 16384
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
+; RV32ZVBC32-NEXT:    vlse64.v v9, (s3), zero
+; RV32ZVBC32-NEXT:    lui s3, 32768
+; RV32ZVBC32-NEXT:    sw s2, 16(sp)
+; RV32ZVBC32-NEXT:    lui a7, 524288
+; RV32ZVBC32-NEXT:    sw zero, 20(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw s11, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw s5, 268(sp)
+; RV32ZVBC32-NEXT:    lui s5, 65536
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw s7, 260(sp)
+; RV32ZVBC32-NEXT:    lui s7, 131072
+; RV32ZVBC32-NEXT:    sw zero, 248(sp)
+; RV32ZVBC32-NEXT:    sw s10, 252(sp)
+; RV32ZVBC32-NEXT:    lui a1, 262144
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    li s2, 16
+; RV32ZVBC32-NEXT:    sw s2, 244(sp)
+; RV32ZVBC32-NEXT:    li s10, 16
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    li s2, 32
+; RV32ZVBC32-NEXT:    sw s2, 236(sp)
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw ra, 228(sp)
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s9, 220(sp)
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s8, 212(sp)
+; RV32ZVBC32-NEXT:    li s2, 256
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s6, 204(sp)
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw s4, 196(sp)
+; RV32ZVBC32-NEXT:    slli s11, s11, 11
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw s11, 188(sp)
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw s1, 180(sp)
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw s0, 172(sp)
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw t6, 164(sp)
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw t5, 156(sp)
+; RV32ZVBC32-NEXT:    lui s1, 8
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw t4, 148(sp)
+; RV32ZVBC32-NEXT:    lui s0, 16
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw t3, 140(sp)
+; RV32ZVBC32-NEXT:    lui t5, 32
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw t2, 132(sp)
+; RV32ZVBC32-NEXT:    lui t4, 64
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw t1, 124(sp)
+; RV32ZVBC32-NEXT:    lui t3, 128
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw t0, 116(sp)
+; RV32ZVBC32-NEXT:    lui t2, 256
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw a6, 108(sp)
+; RV32ZVBC32-NEXT:    lui t0, 512
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw a5, 100(sp)
+; RV32ZVBC32-NEXT:    lui a6, 1024
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw a4, 92(sp)
+; RV32ZVBC32-NEXT:    lui a5, 2048
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw a3, 84(sp)
+; RV32ZVBC32-NEXT:    lui t1, 4096
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw a2, 76(sp)
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw a0, 68(sp)
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw s3, 60(sp)
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw s5, 52(sp)
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw s7, 44(sp)
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    sw a1, 36(sp)
+; RV32ZVBC32-NEXT:    lui a3, 262144
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    sw a7, 28(sp)
+; RV32ZVBC32-NEXT:    addi a1, sp, 16
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 272
+; RV32ZVBC32-NEXT:    vlse64.v v3, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 264
+; RV32ZVBC32-NEXT:    vlse64.v v10, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 256
+; RV32ZVBC32-NEXT:    vlse64.v v15, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 248
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 240
+; RV32ZVBC32-NEXT:    vlse64.v v17, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 232
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 224
+; RV32ZVBC32-NEXT:    vlse64.v v19, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 216
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 208
+; RV32ZVBC32-NEXT:    vlse64.v v21, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 200
+; RV32ZVBC32-NEXT:    vlse64.v v22, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 192
+; RV32ZVBC32-NEXT:    vlse64.v v23, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 184
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 176
+; RV32ZVBC32-NEXT:    vlse64.v v25, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 168
+; RV32ZVBC32-NEXT:    vlse64.v v26, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 160
+; RV32ZVBC32-NEXT:    vlse64.v v27, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 152
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 144
+; RV32ZVBC32-NEXT:    vlse64.v v29, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 136
+; RV32ZVBC32-NEXT:    vlse64.v v30, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 128
+; RV32ZVBC32-NEXT:    vlse64.v v31, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 120
+; RV32ZVBC32-NEXT:    vlse64.v v7, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 112
+; RV32ZVBC32-NEXT:    vlse64.v v6, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 104
+; RV32ZVBC32-NEXT:    vlse64.v v5, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 96
+; RV32ZVBC32-NEXT:    vlse64.v v4, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 88
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 80
+; RV32ZVBC32-NEXT:    vlse64.v v11, (a1), zero
+; RV32ZVBC32-NEXT:    addi a1, sp, 72
+; RV32ZVBC32-NEXT:    vlse64.v v13, (a1), zero
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 2
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs1r.v v13, (a1) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi ra, sp, 64
+; RV32ZVBC32-NEXT:    vand.vi v2, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v1, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v1, v2
+; RV32ZVBC32-NEXT:    vand.vi v1, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vi v1, v9, 8
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s10
+; RV32ZVBC32-NEXT:    addi s10, sp, 56
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    li a1, 32
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a1
+; RV32ZVBC32-NEXT:    addi s9, sp, 48
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    li a1, 64
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a1
+; RV32ZVBC32-NEXT:    addi s8, sp, 40
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    li a1, 128
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a1
+; RV32ZVBC32-NEXT:    addi s6, sp, 32
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s2
+; RV32ZVBC32-NEXT:    addi s4, sp, 24
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    li a1, 512
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a1
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    li a1, 1024
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a1
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s11
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    lui a1, 1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a1
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    lui a1, 2
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a1
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t6
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s1
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s0
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t5
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t4
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t3
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t2
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t0
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a6
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a5
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t1
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a2
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a0
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s3
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s5
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s7
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vlse64.v v1, (ra), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vand.vv v13, v9, v3
+; RV32ZVBC32-NEXT:    vand.vv v14, v9, v10
+; RV32ZVBC32-NEXT:    vand.vv v15, v9, v15
+; RV32ZVBC32-NEXT:    vand.vv v16, v9, v16
+; RV32ZVBC32-NEXT:    vand.vv v17, v9, v17
+; RV32ZVBC32-NEXT:    vand.vv v18, v9, v18
+; RV32ZVBC32-NEXT:    vand.vv v19, v9, v19
+; RV32ZVBC32-NEXT:    vand.vv v20, v9, v20
+; RV32ZVBC32-NEXT:    vand.vv v21, v9, v21
+; RV32ZVBC32-NEXT:    vand.vv v22, v9, v22
+; RV32ZVBC32-NEXT:    vand.vv v23, v9, v23
+; RV32ZVBC32-NEXT:    vand.vv v24, v9, v24
+; RV32ZVBC32-NEXT:    vand.vv v25, v9, v25
+; RV32ZVBC32-NEXT:    vand.vv v26, v9, v26
+; RV32ZVBC32-NEXT:    vand.vv v27, v9, v27
+; RV32ZVBC32-NEXT:    vand.vv v28, v9, v28
+; RV32ZVBC32-NEXT:    vand.vv v29, v9, v29
+; RV32ZVBC32-NEXT:    vand.vv v30, v9, v30
+; RV32ZVBC32-NEXT:    vand.vv v31, v9, v31
+; RV32ZVBC32-NEXT:    vand.vv v7, v9, v7
+; RV32ZVBC32-NEXT:    vand.vv v6, v9, v6
+; RV32ZVBC32-NEXT:    vand.vv v5, v9, v5
+; RV32ZVBC32-NEXT:    vand.vv v4, v9, v4
+; RV32ZVBC32-NEXT:    vand.vv v0, v9, v0
+; RV32ZVBC32-NEXT:    vlse64.v v3, (s10), zero
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v11
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a1, a0
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v3
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v3, (s9), zero
+; RV32ZVBC32-NEXT:    vlse64.v v1, (s8), zero
+; RV32ZVBC32-NEXT:    vlse64.v v10, (s6), zero
+; RV32ZVBC32-NEXT:    vlse64.v v11, (s4), zero
+; RV32ZVBC32-NEXT:    vand.vv v3, v9, v3
+; RV32ZVBC32-NEXT:    vand.vv v1, v9, v1
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v9, v9, a3
+; RV32ZVBC32-NEXT:    vmul.vv v9, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v9, v2, v9
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v13
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v14
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v15
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v17
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v19
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v22
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v23
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v25
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v26
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v27
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v28
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v29
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v30
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v31
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v7
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v6
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v5
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v0
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a1, a0
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v3
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v11
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a1, a0
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v10, v0.t
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    .cfi_def_cfa sp, 352
+; RV32ZVBC32-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    .cfi_restore ra
+; RV32ZVBC32-NEXT:    .cfi_restore s0
+; RV32ZVBC32-NEXT:    .cfi_restore s1
+; RV32ZVBC32-NEXT:    .cfi_restore s2
+; RV32ZVBC32-NEXT:    .cfi_restore s3
+; RV32ZVBC32-NEXT:    .cfi_restore s4
+; RV32ZVBC32-NEXT:    .cfi_restore s5
+; RV32ZVBC32-NEXT:    .cfi_restore s6
+; RV32ZVBC32-NEXT:    .cfi_restore s7
+; RV32ZVBC32-NEXT:    .cfi_restore s8
+; RV32ZVBC32-NEXT:    .cfi_restore s9
+; RV32ZVBC32-NEXT:    .cfi_restore s10
+; RV32ZVBC32-NEXT:    .cfi_restore s11
+; RV32ZVBC32-NEXT:    addi sp, sp, 352
+; RV32ZVBC32-NEXT:    .cfi_def_cfa_offset 0
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmul_nxv1i64_vx_mask:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
-; RV64ZVBC-NEXT:    vclmul.vx v8, v8, a0, v0.t
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmul_nxv1i64_vx_mask:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    andi a1, a0, 2
+; RV64ZVBC32-NEXT:    andi a2, a0, 1
+; RV64ZVBC32-NEXT:    vsetvli a3, zero, e64, m1, ta, mu
+; RV64ZVBC32-NEXT:    vmul.vx v9, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 4
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 8
+; RV64ZVBC32-NEXT:    vxor.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 16
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 32
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 64
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 128
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a1
+; RV64ZVBC32-NEXT:    andi a1, a0, 256
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    andi a2, a0, 512
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a1
+; RV64ZVBC32-NEXT:    andi a3, a0, 1024
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    li a1, 1
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a3
+; RV64ZVBC32-NEXT:    slli a2, a1, 11
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 64
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 128
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 256
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 512
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1024
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2048
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4096
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8192
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16384
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32768
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 65536
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 131072
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 262144
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    srliw a2, a0, 31
+; RV64ZVBC32-NEXT:    slli a2, a2, 31
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 32
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 33
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 34
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 35
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 36
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 37
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 38
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 39
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 40
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 41
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 42
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 43
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 44
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 45
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 46
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 47
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 48
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 49
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 50
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 51
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 52
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 53
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 54
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 55
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 56
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 57
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 58
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 59
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 60
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 61
+; RV64ZVBC32-NEXT:    slli a1, a1, 62
+; RV64ZVBC32-NEXT:    and a2, a0, a2
+; RV64ZVBC32-NEXT:    and a1, a0, a1
+; RV64ZVBC32-NEXT:    srli a0, a0, 63
+; RV64ZVBC32-NEXT:    slli a0, a0, 63
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a1
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, a0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v10, v0.t
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
   %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
   %v = call <vscale x 1 x i64> @llvm.clmul.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb)
   %w = select <vscale x 1 x i1> %mask, <vscale x 1 x i64> %v, <vscale x 1 x i64> %va
   ret <vscale x 1 x i64> %w
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32ZVBC: {{.*}}
+; RV64ZVBC: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll
index e8d33a5b10bd7..e7492a670e3af 100644
--- a/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+v < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V
 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs -mattr=+v < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64V
-; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+v,+zvbc < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVBC
-; RUN: llc -mtriple=riscv64 -verify-machineinstrs -mattr=+v,+zvbc < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVBC
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+v,+zvbc < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVBC,RV32ZVBC64
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs -mattr=+v,+zvbc < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVBC,RV64ZVBC64
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+v,+experimental-zvbc32e < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVBC,RV32ZVBC32
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs -mattr=+v,+experimental-zvbc32e < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVBC,RV64ZVBC32
 
 define <vscale x 1 x i8> @clmulh_nxv1i8_vv(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb) nounwind {
 ; RV32V-LABEL: clmulh_nxv1i8_vv:
@@ -77,33 +79,105 @@ define <vscale x 1 x i8> @clmulh_nxv1i8_vv(<vscale x 1 x i8> %va, <vscale x 1 x
 ; RV64V-NEXT:    vnsrl.wi v8, v8, 8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv1i8_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v10, v9
-; RV32ZVBC-NEXT:    vzext.vf8 v9, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v9, v10
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 8
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv1i8_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf8 v9, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv1i8_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v10, v9
-; RV64ZVBC-NEXT:    vzext.vf8 v9, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v9, v10
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 8
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmulh_nxv1i8_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf8 v9, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv1i8_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv1i8_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 1 x i8> %va to <vscale x 1 x i16>
   %vb.ext = zext <vscale x 1 x i8> %vb to <vscale x 1 x i16>
   %clmul = call <vscale x 1 x i16> @llvm.clmul.nxv1i16(<vscale x 1 x i16> %va.ext, <vscale x 1 x i16> %vb.ext)
@@ -189,37 +263,113 @@ define <vscale x 1 x i8> @clmulh_nxv1i8_vx(<vscale x 1 x i8> %va, i8 %b) nounwin
 ; RV64V-NEXT:    vnsrl.wi v8, v8, 8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv1i8_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v10, v8
-; RV32ZVBC-NEXT:    vzext.vf8 v8, v9
-; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 8
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv1i8_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf8 v8, v9
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv1i8_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf8 v8, v9
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv1i8_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v10, v8
-; RV64ZVBC-NEXT:    vzext.vf8 v8, v9
-; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 8
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv1i8_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv1i8_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
   %va.ext = zext <vscale x 1 x i8> %va to <vscale x 1 x i16>
@@ -303,33 +453,105 @@ define <vscale x 2 x i8> @clmulh_nxv2i8_vv(<vscale x 2 x i8> %va, <vscale x 2 x
 ; RV64V-NEXT:    vnsrl.wi v8, v8, 8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv2i8_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v10, v9
-; RV32ZVBC-NEXT:    vzext.vf8 v12, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v10
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 8
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv2i8_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf8 v12, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v12, v10
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv2i8_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf8 v12, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v12, v10
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv2i8_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v10, v9
-; RV64ZVBC-NEXT:    vzext.vf8 v12, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v10
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 8
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv2i8_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv2i8_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 2 x i8> %va to <vscale x 2 x i16>
   %vb.ext = zext <vscale x 2 x i8> %vb to <vscale x 2 x i16>
   %clmul = call <vscale x 2 x i16> @llvm.clmul.nxv2i16(<vscale x 2 x i16> %va.ext, <vscale x 2 x i16> %vb.ext)
@@ -415,37 +637,113 @@ define <vscale x 2 x i8> @clmulh_nxv2i8_vx(<vscale x 2 x i8> %va, i8 %b) nounwin
 ; RV64V-NEXT:    vnsrl.wi v8, v8, 8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv2i8_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v12, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v10, v8
-; RV32ZVBC-NEXT:    vzext.vf8 v8, v12
-; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 8
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv2i8_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf8 v8, v12
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv2i8_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf8 v8, v12
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv2i8_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v12, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v10, v8
-; RV64ZVBC-NEXT:    vzext.vf8 v8, v12
-; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 8
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv2i8_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv2i8_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
   %va.ext = zext <vscale x 2 x i8> %va to <vscale x 2 x i16>
@@ -529,33 +827,105 @@ define <vscale x 4 x i8> @clmulh_nxv4i8_vv(<vscale x 4 x i8> %va, <vscale x 4 x
 ; RV64V-NEXT:    vnsrl.wi v8, v8, 8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv4i8_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v12, v9
-; RV32ZVBC-NEXT:    vzext.vf8 v16, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v12
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 8
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv4i8_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v12, v9
+; RV32ZVBC64-NEXT:    vzext.vf8 v16, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v16, v12
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv4i8_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v12, v9
+; RV64ZVBC64-NEXT:    vzext.vf8 v16, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v16, v12
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv4i8_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v12, v9
-; RV64ZVBC-NEXT:    vzext.vf8 v16, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v12
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 8
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv4i8_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv4i8_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 4 x i8> %va to <vscale x 4 x i16>
   %vb.ext = zext <vscale x 4 x i8> %vb to <vscale x 4 x i16>
   %clmul = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %va.ext, <vscale x 4 x i16> %vb.ext)
@@ -641,37 +1011,113 @@ define <vscale x 4 x i8> @clmulh_nxv4i8_vx(<vscale x 4 x i8> %va, i8 %b) nounwin
 ; RV64V-NEXT:    vnsrl.wi v8, v8, 8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv4i8_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v16, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v12, v8
-; RV32ZVBC-NEXT:    vzext.vf8 v8, v16
-; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 8
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv4i8_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v12, v8
+; RV32ZVBC64-NEXT:    vzext.vf8 v8, v16
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v12, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv4i8_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v12, v8
+; RV64ZVBC64-NEXT:    vzext.vf8 v8, v16
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v12, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv4i8_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v16, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v12, v8
-; RV64ZVBC-NEXT:    vzext.vf8 v8, v16
-; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 8
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv4i8_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv4i8_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
   %va.ext = zext <vscale x 4 x i8> %va to <vscale x 4 x i16>
@@ -755,33 +1201,105 @@ define <vscale x 8 x i8> @clmulh_nxv8i8_vv(<vscale x 8 x i8> %va, <vscale x 8 x
 ; RV64V-NEXT:    vnsrl.wi v8, v10, 8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv8i8_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v16, v9
-; RV32ZVBC-NEXT:    vzext.vf8 v24, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v24, v16
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v10, v16, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 8
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv8i8_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v16, v9
+; RV32ZVBC64-NEXT:    vzext.vf8 v24, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v24, v16
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v10, v16, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 8
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv8i8_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v16, v9
+; RV64ZVBC64-NEXT:    vzext.vf8 v24, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v24, v16
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v10, v16, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 8
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv8i8_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v16, v9
-; RV64ZVBC-NEXT:    vzext.vf8 v24, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v24, v16
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v10, v16, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 8
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv8i8_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC32-NEXT:    vzext.vf2 v12, v9
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v8, v12, 2
+; RV32ZVBC32-NEXT:    vand.vi v14, v12, 1
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v14, v8
+; RV32ZVBC32-NEXT:    vand.vi v14, v12, 4
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vi v14, v12, 8
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v12, v12, a0
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vmul.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v10, 8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv8i8_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC32-NEXT:    vzext.vf2 v12, v9
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v8, v12, 2
+; RV64ZVBC32-NEXT:    vand.vi v14, v12, 1
+; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v14, v8
+; RV64ZVBC32-NEXT:    vand.vi v14, v12, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vi v14, v12, 8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a0
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vmul.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vxor.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v10, 8
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 8 x i8> %va to <vscale x 8 x i16>
   %vb.ext = zext <vscale x 8 x i8> %vb to <vscale x 8 x i16>
   %clmul = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %va.ext, <vscale x 8 x i16> %vb.ext)
@@ -867,37 +1385,113 @@ define <vscale x 8 x i8> @clmulh_nxv8i8_vx(<vscale x 8 x i8> %va, i8 %b) nounwin
 ; RV64V-NEXT:    vnsrl.wi v8, v10, 8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv8i8_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v24, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf8 v16, v8
-; RV32ZVBC-NEXT:    vzext.vf8 v8, v24
-; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v10, v16, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 8
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv8i8_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf8 v16, v8
+; RV32ZVBC64-NEXT:    vzext.vf8 v8, v24
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v10, v16, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 8
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv8i8_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf8 v16, v8
+; RV64ZVBC64-NEXT:    vzext.vf8 v8, v24
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v10, v16, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 8
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv8i8_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v24, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf8 v16, v8
-; RV64ZVBC-NEXT:    vzext.vf8 v8, v24
-; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v10, v16, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 8
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv8i8_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v12
+; RV32ZVBC32-NEXT:    vand.vi v12, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v14, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV32ZVBC32-NEXT:    vand.vi v14, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vi v14, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vxor.vv v10, v12, v8
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v10, 8
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv8i8_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v12
+; RV64ZVBC32-NEXT:    vand.vi v12, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v14, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV64ZVBC32-NEXT:    vand.vi v14, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vi v14, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vxor.vv v10, v12, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v10, 8
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
   %va.ext = zext <vscale x 8 x i8> %va to <vscale x 8 x i16>
@@ -1734,29 +2328,167 @@ define <vscale x 1 x i16> @clmulh_nxv1i16_vv(<vscale x 1 x i16> %va, <vscale x 1
 ; RV64V-NEXT:    vnsrl.wi v8, v8, 16
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv1i16_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v10, v9
-; RV32ZVBC-NEXT:    vzext.vf4 v9, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v9, v10
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv1i16_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf4 v9, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv1i16_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf4 v9, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 16
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv1i16_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v10, v9
-; RV64ZVBC-NEXT:    vzext.vf4 v9, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v9, v10
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 16
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv1i16_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv1i16_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 1 x i16> %va to <vscale x 1 x i32>
   %vb.ext = zext <vscale x 1 x i16> %vb to <vscale x 1 x i32>
   %clmul = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va.ext, <vscale x 1 x i32> %vb.ext)
@@ -1908,33 +2640,175 @@ define <vscale x 1 x i16> @clmulh_nxv1i16_vx(<vscale x 1 x i16> %va, i16 %b) nou
 ; RV64V-NEXT:    vnsrl.wi v8, v8, 16
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv1i16_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v10, v8
-; RV32ZVBC-NEXT:    vzext.vf4 v8, v9
-; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv1i16_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf4 v8, v9
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv1i16_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf4 v8, v9
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 16
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv1i16_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v10, a0
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v9, v8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v10
+; RV32ZVBC32-NEXT:    vand.vi v10, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv1i16_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v10, v8
-; RV64ZVBC-NEXT:    vzext.vf4 v8, v9
-; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 16
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmulh_nxv1i16_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v10, a0
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v9, v8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v10
+; RV64ZVBC32-NEXT:    vand.vi v10, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
   %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
   %va.ext = zext <vscale x 1 x i16> %va to <vscale x 1 x i32>
@@ -2084,29 +2958,167 @@ define <vscale x 2 x i16> @clmulh_nxv2i16_vv(<vscale x 2 x i16> %va, <vscale x 2
 ; RV64V-NEXT:    vnsrl.wi v8, v8, 16
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv2i16_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v10, v9
-; RV32ZVBC-NEXT:    vzext.vf4 v12, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v10
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv2i16_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf4 v12, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v12, v10
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv2i16_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf4 v12, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v12, v10
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 16
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv2i16_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv2i16_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v10, v9
-; RV64ZVBC-NEXT:    vzext.vf4 v12, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v10
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 16
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmulh_nxv2i16_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 2 x i16> %va to <vscale x 2 x i32>
   %vb.ext = zext <vscale x 2 x i16> %vb to <vscale x 2 x i32>
   %clmul = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %va.ext, <vscale x 2 x i32> %vb.ext)
@@ -2258,33 +3270,175 @@ define <vscale x 2 x i16> @clmulh_nxv2i16_vx(<vscale x 2 x i16> %va, i16 %b) nou
 ; RV64V-NEXT:    vnsrl.wi v8, v8, 16
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv2i16_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v12, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v10, v8
-; RV32ZVBC-NEXT:    vzext.vf4 v8, v12
-; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv2i16_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf4 v8, v12
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 16
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv2i16_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v12, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v10, v8
-; RV64ZVBC-NEXT:    vzext.vf4 v8, v12
-; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 16
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmulh_nxv2i16_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf4 v8, v12
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 16
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv2i16_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v10, a0
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v9, v8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v10
+; RV32ZVBC32-NEXT:    vand.vi v10, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v9, v10
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv2i16_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v10, a0
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v9, v8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v10
+; RV64ZVBC32-NEXT:    vand.vi v10, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
   %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
   %va.ext = zext <vscale x 2 x i16> %va to <vscale x 2 x i32>
@@ -2434,29 +3588,167 @@ define <vscale x 4 x i16> @clmulh_nxv4i16_vv(<vscale x 4 x i16> %va, <vscale x 4
 ; RV64V-NEXT:    vnsrl.wi v8, v10, 16
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv4i16_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v12, v9
-; RV32ZVBC-NEXT:    vzext.vf4 v16, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v12
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv4i16_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v12, v9
+; RV32ZVBC64-NEXT:    vzext.vf4 v16, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v16, v12
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv4i16_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v12, v9
+; RV64ZVBC64-NEXT:    vzext.vf4 v16, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v16, v12
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 16
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv4i16_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC32-NEXT:    vzext.vf2 v12, v9
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v8, v12, 2
+; RV32ZVBC32-NEXT:    vand.vi v14, v12, 1
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v14, v8
+; RV32ZVBC32-NEXT:    vand.vi v14, v12, 4
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vi v14, v12, 8
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v12, v12, a0
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vmul.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v10, 16
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv4i16_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v12, v9
-; RV64ZVBC-NEXT:    vzext.vf4 v16, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v12
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 16
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmulh_nxv4i16_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC32-NEXT:    vzext.vf2 v12, v9
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v8, v12, 2
+; RV64ZVBC32-NEXT:    vand.vi v14, v12, 1
+; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v14, v8
+; RV64ZVBC32-NEXT:    vand.vi v14, v12, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vi v14, v12, 8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a0
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vmul.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vxor.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v10, 16
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 4 x i16> %va to <vscale x 4 x i32>
   %vb.ext = zext <vscale x 4 x i16> %vb to <vscale x 4 x i32>
   %clmul = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %va.ext, <vscale x 4 x i32> %vb.ext)
@@ -2608,33 +3900,175 @@ define <vscale x 4 x i16> @clmulh_nxv4i16_vx(<vscale x 4 x i16> %va, i16 %b) nou
 ; RV64V-NEXT:    vnsrl.wi v8, v10, 16
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv4i16_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v16, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v12, v8
-; RV32ZVBC-NEXT:    vzext.vf4 v8, v16
-; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv4i16_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v12, v8
+; RV32ZVBC64-NEXT:    vzext.vf4 v8, v16
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v12, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 16
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv4i16_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v16, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v12, v8
-; RV64ZVBC-NEXT:    vzext.vf4 v8, v16
-; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 16
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmulh_nxv4i16_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v12, v8
+; RV64ZVBC64-NEXT:    vzext.vf4 v8, v16
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v12, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 16
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv4i16_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v12
+; RV32ZVBC32-NEXT:    vand.vi v12, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v14, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV32ZVBC32-NEXT:    vand.vi v14, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vi v14, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vxor.vv v10, v12, v8
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v10, 16
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv4i16_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v12
+; RV64ZVBC32-NEXT:    vand.vi v12, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v14, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV64ZVBC32-NEXT:    vand.vi v14, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vi v14, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vxor.vv v10, v12, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v10, 16
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
   %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
   %va.ext = zext <vscale x 4 x i16> %va to <vscale x 4 x i32>
@@ -2784,29 +4218,167 @@ define <vscale x 8 x i16> @clmulh_nxv8i16_vv(<vscale x 8 x i16> %va, <vscale x 8
 ; RV64V-NEXT:    vnsrl.wi v8, v12, 16
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv8i16_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v16, v10
-; RV32ZVBC-NEXT:    vzext.vf4 v24, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v24, v16
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v16, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv8i16_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v16, v10
+; RV32ZVBC64-NEXT:    vzext.vf4 v24, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v24, v16
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv8i16_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v16, v10
+; RV64ZVBC64-NEXT:    vzext.vf4 v24, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v24, v16
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 16
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv8i16_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC32-NEXT:    vzext.vf2 v16, v10
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vi v8, v16, 2
+; RV32ZVBC32-NEXT:    vand.vi v20, v16, 1
+; RV32ZVBC32-NEXT:    vmul.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v20, v8
+; RV32ZVBC32-NEXT:    vand.vi v20, v16, 4
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vi v20, v16, 8
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vmul.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v12, 16
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv8i16_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v16, v10
-; RV64ZVBC-NEXT:    vzext.vf4 v24, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v24, v16
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v16, 16
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmulh_nxv8i16_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC32-NEXT:    vzext.vf2 v16, v10
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vand.vi v8, v16, 2
+; RV64ZVBC32-NEXT:    vand.vi v20, v16, 1
+; RV64ZVBC32-NEXT:    vmul.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v20, v8
+; RV64ZVBC32-NEXT:    vand.vi v20, v16, 4
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vi v20, v16, 8
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vmul.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vxor.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v12, 16
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 8 x i16> %va to <vscale x 8 x i32>
   %vb.ext = zext <vscale x 8 x i16> %vb to <vscale x 8 x i32>
   %clmul = call <vscale x 8 x i32> @llvm.clmul.nxv8i32(<vscale x 8 x i32> %va.ext, <vscale x 8 x i32> %vb.ext)
@@ -2958,33 +4530,175 @@ define <vscale x 8 x i16> @clmulh_nxv8i16_vx(<vscale x 8 x i16> %va, i16 %b) nou
 ; RV64V-NEXT:    vnsrl.wi v8, v12, 16
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv8i16_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v24, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf4 v16, v8
-; RV32ZVBC-NEXT:    vzext.vf4 v8, v24
-; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v8
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wi v8, v16, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv8i16_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v16, v8
+; RV32ZVBC64-NEXT:    vzext.vf4 v8, v24
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 16
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv8i16_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v24, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf4 v16, v8
-; RV64ZVBC-NEXT:    vzext.vf4 v8, v24
-; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v8
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wi v8, v16, 16
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmulh_nxv8i16_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v16, v8
+; RV64ZVBC64-NEXT:    vzext.vf4 v8, v24
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 16
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv8i16_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v16
+; RV32ZVBC32-NEXT:    vand.vi v16, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v20, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v16, v12, v16
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v20, v16
+; RV32ZVBC32-NEXT:    vand.vi v20, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vi v20, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    slli a0, a0, 11
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 8
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vmul.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vxor.vv v12, v16, v8
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wi v8, v12, 16
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv8i16_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC32-NEXT:    li a0, 16
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v16
+; RV64ZVBC32-NEXT:    vand.vi v16, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v20, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v16, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v20, v16
+; RV64ZVBC32-NEXT:    vand.vi v20, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vi v20, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 128
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 256
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 512
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    slli a0, a0, 11
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 1
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 2
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 4
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV64ZVBC32-NEXT:    lui a0, 8
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vmul.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vxor.vv v12, v16, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wi v8, v12, 16
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
   %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
   %va.ext = zext <vscale x 8 x i16> %va to <vscale x 8 x i32>
@@ -4639,27 +6353,299 @@ define <vscale x 1 x i32> @clmulh_nxv1i32_vv(<vscale x 1 x i32> %va, <vscale x 1
 ; RV64V-NEXT:    vnsrl.wx v8, v8, a0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv1i32_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v10, v9
-; RV32ZVBC-NEXT:    vzext.vf2 v9, v8
-; RV32ZVBC-NEXT:    vclmul.vv v8, v9, v10
-; RV32ZVBC-NEXT:    li a0, 32
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wx v8, v8, a0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv1i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf2 v9, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v8, a0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv1i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf2 v9, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v8, a0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv1i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -16
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV32ZVBC32-NEXT:    li a2, 16
+; RV32ZVBC32-NEXT:    li a1, 32
+; RV32ZVBC32-NEXT:    vand.vx v9, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 64
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v12, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v11, v12, v11
+; RV32ZVBC32-NEXT:    vand.vi v12, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV32ZVBC32-NEXT:    vand.vi v12, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 1
+; RV32ZVBC32-NEXT:    slli a2, a2, 11
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 2
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 4
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 8
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 16
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 32
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 64
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 2048
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 4096
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 8192
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 16384
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 32768
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 65536
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 131072
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    addi a2, sp, 8
+; RV32ZVBC32-NEXT:    sw a0, 8(sp)
+; RV32ZVBC32-NEXT:    sw zero, 12(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vlse64.v v11, (a2), zero
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    vand.vv v11, v8, v11
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v11
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wx v8, v8, a1
+; RV32ZVBC32-NEXT:    addi sp, sp, 16
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv1i32_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v10, v9
-; RV64ZVBC-NEXT:    vzext.vf2 v9, v8
-; RV64ZVBC-NEXT:    vclmul.vv v8, v9, v10
-; RV64ZVBC-NEXT:    li a0, 32
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wx v8, v8, a0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmulh_nxv1i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
+; RV64ZVBC32-NEXT:    li a1, 16
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    li a2, 64
+; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a1
+; RV64ZVBC32-NEXT:    li a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    li a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a1
+; RV64ZVBC32-NEXT:    li a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    li a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a1
+; RV64ZVBC32-NEXT:    li a1, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    slli a1, a1, 31
+; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wx v8, v8, a0
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 1 x i32> %va to <vscale x 1 x i64>
   %vb.ext = zext <vscale x 1 x i32> %vb to <vscale x 1 x i64>
   %clmul = call <vscale x 1 x i64> @llvm.clmul.nxv1i64(<vscale x 1 x i64> %va.ext, <vscale x 1 x i64> %vb.ext)
@@ -4945,31 +6931,307 @@ define <vscale x 1 x i32> @clmulh_nxv1i32_vx(<vscale x 1 x i32> %va, i32 %b) nou
 ; RV64V-NEXT:    vnsrl.wx v8, v8, a0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv1i32_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC-NEXT:    vzext.vf2 v8, v9
-; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV32ZVBC-NEXT:    li a0, 32
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wx v8, v8, a0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv1i32_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v9
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v8, a0
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv1i32_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC-NEXT:    vzext.vf2 v8, v9
-; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
-; RV64ZVBC-NEXT:    li a0, 32
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wx v8, v8, a0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmulh_nxv1i32_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v9
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v8, a0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv1i32_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -16
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v9, v8
+; RV32ZVBC32-NEXT:    li a2, 16
+; RV32ZVBC32-NEXT:    li a1, 32
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v10
+; RV32ZVBC32-NEXT:    vand.vx v10, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 64
+; RV32ZVBC32-NEXT:    vand.vi v11, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v12, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vxor.vv v11, v12, v11
+; RV32ZVBC32-NEXT:    vand.vi v12, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV32ZVBC32-NEXT:    vand.vi v12, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v10, v9, v10
+; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a1
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 1
+; RV32ZVBC32-NEXT:    slli a2, a2, 11
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 1
+; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 2
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 4
+; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 8
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 16
+; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 32
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 64
+; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 2048
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 4096
+; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 8192
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 16384
+; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 32768
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 65536
+; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 131072
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV32ZVBC32-NEXT:    addi a2, sp, 8
+; RV32ZVBC32-NEXT:    sw a0, 8(sp)
+; RV32ZVBC32-NEXT:    sw zero, 12(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a2), zero
+; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    vand.vv v11, v8, v12
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vmul.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wx v8, v8, a1
+; RV32ZVBC32-NEXT:    addi sp, sp, 16
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv1i32_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v10, a0
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v9, v8
+; RV64ZVBC32-NEXT:    li a1, 16
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    li a2, 64
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v10
+; RV64ZVBC32-NEXT:    vand.vi v10, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v10, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a1
+; RV64ZVBC32-NEXT:    li a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    li a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a1
+; RV64ZVBC32-NEXT:    li a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    li a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a1
+; RV64ZVBC32-NEXT:    li a1, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 11
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 64
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 128
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 512
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 262144
+; RV64ZVBC32-NEXT:    slli a1, a1, 31
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wx v8, v8, a0
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i64 0
   %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
   %va.ext = zext <vscale x 1 x i32> %va to <vscale x 1 x i64>
@@ -5253,27 +7515,299 @@ define <vscale x 2 x i32> @clmulh_nxv2i32_vv(<vscale x 2 x i32> %va, <vscale x 2
 ; RV64V-NEXT:    vnsrl.wx v8, v10, a0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv2i32_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v10, v9
-; RV32ZVBC-NEXT:    vzext.vf2 v12, v8
-; RV32ZVBC-NEXT:    vclmul.vv v10, v12, v10
-; RV32ZVBC-NEXT:    li a0, 32
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wx v8, v10, a0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv2i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v10, v12, v10
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v10, a0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv2i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v10, v12, v10
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v10, a0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv2i32_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v10, v9
-; RV64ZVBC-NEXT:    vzext.vf2 v12, v8
-; RV64ZVBC-NEXT:    vclmul.vv v10, v12, v10
-; RV64ZVBC-NEXT:    li a0, 32
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wx v8, v10, a0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv2i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -16
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC32-NEXT:    vzext.vf2 v12, v9
+; RV32ZVBC32-NEXT:    li a2, 16
+; RV32ZVBC32-NEXT:    li a1, 32
+; RV32ZVBC32-NEXT:    vand.vx v8, v12, a2
+; RV32ZVBC32-NEXT:    li a2, 64
+; RV32ZVBC32-NEXT:    vand.vi v14, v12, 2
+; RV32ZVBC32-NEXT:    vand.vi v16, v12, 1
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v14, v16, v14
+; RV32ZVBC32-NEXT:    vand.vi v16, v12, 4
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    vand.vi v16, v12, 8
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
+; RV32ZVBC32-NEXT:    li a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v14, v8
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a1
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV32ZVBC32-NEXT:    li a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
+; RV32ZVBC32-NEXT:    li a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV32ZVBC32-NEXT:    li a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 1
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV32ZVBC32-NEXT:    li a2, 1
+; RV32ZVBC32-NEXT:    slli a2, a2, 11
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 2
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 4
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 8
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 16
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 32
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 64
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 2048
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 4096
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 8192
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 16384
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 32768
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 65536
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV32ZVBC32-NEXT:    lui a2, 131072
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
+; RV32ZVBC32-NEXT:    addi a2, sp, 8
+; RV32ZVBC32-NEXT:    sw a0, 8(sp)
+; RV32ZVBC32-NEXT:    sw zero, 12(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a2), zero
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    vand.vv v14, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v12, v12, a0
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vmul.vv v10, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wx v8, v10, a1
+; RV32ZVBC32-NEXT:    addi sp, sp, 16
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv2i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC32-NEXT:    vzext.vf2 v12, v9
+; RV64ZVBC32-NEXT:    li a1, 16
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    li a2, 64
+; RV64ZVBC32-NEXT:    vand.vi v8, v12, 2
+; RV64ZVBC32-NEXT:    vand.vi v14, v12, 1
+; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v14, v8
+; RV64ZVBC32-NEXT:    vand.vi v14, v12, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vi v14, v12, 8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a1
+; RV64ZVBC32-NEXT:    li a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    li a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a1
+; RV64ZVBC32-NEXT:    li a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    li a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a1
+; RV64ZVBC32-NEXT:    li a1, 1
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 11
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 1
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 2
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 16
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 32
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 64
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 128
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 512
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    lui a2, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
+; RV64ZVBC32-NEXT:    slli a1, a1, 31
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a1
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vmul.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vxor.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wx v8, v10, a0
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 2 x i32> %va to <vscale x 2 x i64>
   %vb.ext = zext <vscale x 2 x i32> %vb to <vscale x 2 x i64>
   %clmul = call <vscale x 2 x i64> @llvm.clmul.nxv2i64(<vscale x 2 x i64> %va.ext, <vscale x 2 x i64> %vb.ext)
@@ -5559,31 +8093,307 @@ define <vscale x 2 x i32> @clmulh_nxv2i32_vx(<vscale x 2 x i32> %va, i32 %b) nou
 ; RV64V-NEXT:    vnsrl.wx v8, v10, a0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv2i32_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v12, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC-NEXT:    vzext.vf2 v8, v12
-; RV32ZVBC-NEXT:    vclmul.vv v10, v10, v8
-; RV32ZVBC-NEXT:    li a0, 32
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wx v8, v10, a0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv2i32_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v12
+; RV32ZVBC64-NEXT:    vclmul.vv v10, v10, v8
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v10, a0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv2i32_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v12
+; RV64ZVBC64-NEXT:    vclmul.vv v10, v10, v8
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v10, a0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv2i32_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v12, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC-NEXT:    vzext.vf2 v8, v12
-; RV64ZVBC-NEXT:    vclmul.vv v10, v10, v8
-; RV64ZVBC-NEXT:    li a0, 32
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wx v8, v10, a0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv2i32_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -16
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC32-NEXT:    li a2, 16
+; RV32ZVBC32-NEXT:    li a1, 32
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v12
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 64
+; RV32ZVBC32-NEXT:    vand.vi v14, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v16, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v14, v16, v14
+; RV32ZVBC32-NEXT:    vand.vi v16, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    vand.vi v16, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a1
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 1
+; RV32ZVBC32-NEXT:    slli a2, a2, 11
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 1
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 2
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 4
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 8
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 16
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 32
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 64
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 2048
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 4096
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 8192
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 16384
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 32768
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 65536
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 131072
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV32ZVBC32-NEXT:    addi a2, sp, 8
+; RV32ZVBC32-NEXT:    sw a0, 8(sp)
+; RV32ZVBC32-NEXT:    sw zero, 12(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a2), zero
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    vand.vv v14, v8, v16
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vmul.vv v10, v10, v14
+; RV32ZVBC32-NEXT:    vxor.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wx v8, v10, a1
+; RV32ZVBC32-NEXT:    addi sp, sp, 16
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv2i32_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC32-NEXT:    li a1, 16
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    li a2, 64
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v12
+; RV64ZVBC32-NEXT:    vand.vi v12, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v14, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV64ZVBC32-NEXT:    vand.vi v14, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vi v14, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a1
+; RV64ZVBC32-NEXT:    li a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    li a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a1
+; RV64ZVBC32-NEXT:    li a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    li a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a1
+; RV64ZVBC32-NEXT:    li a1, 1
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 11
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 64
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 128
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 512
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 262144
+; RV64ZVBC32-NEXT:    slli a1, a1, 31
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vxor.vv v10, v12, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wx v8, v10, a0
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i64 0
   %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
   %va.ext = zext <vscale x 2 x i32> %va to <vscale x 2 x i64>
@@ -5867,27 +8677,299 @@ define <vscale x 4 x i32> @clmulh_nxv4i32_vv(<vscale x 4 x i32> %va, <vscale x 4
 ; RV64V-NEXT:    vnsrl.wx v8, v12, a0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv4i32_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v12, v10
-; RV32ZVBC-NEXT:    vzext.vf2 v16, v8
-; RV32ZVBC-NEXT:    vclmul.vv v12, v16, v12
-; RV32ZVBC-NEXT:    li a0, 32
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wx v8, v12, a0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv4i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v12, v10
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v12, v16, v12
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v12, a0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv4i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v12, v10
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v12, v16, v12
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v12, a0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv4i32_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v12, v10
-; RV64ZVBC-NEXT:    vzext.vf2 v16, v8
-; RV64ZVBC-NEXT:    vclmul.vv v12, v16, v12
-; RV64ZVBC-NEXT:    li a0, 32
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wx v8, v12, a0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv4i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -16
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC32-NEXT:    vzext.vf2 v16, v10
+; RV32ZVBC32-NEXT:    li a2, 16
+; RV32ZVBC32-NEXT:    li a1, 32
+; RV32ZVBC32-NEXT:    vand.vx v8, v16, a2
+; RV32ZVBC32-NEXT:    li a2, 64
+; RV32ZVBC32-NEXT:    vand.vi v20, v16, 2
+; RV32ZVBC32-NEXT:    vand.vi v24, v16, 1
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v20, v24, v20
+; RV32ZVBC32-NEXT:    vand.vi v24, v16, 4
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v24
+; RV32ZVBC32-NEXT:    vand.vi v24, v16, 8
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
+; RV32ZVBC32-NEXT:    li a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v20, v8
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a1
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV32ZVBC32-NEXT:    li a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
+; RV32ZVBC32-NEXT:    li a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV32ZVBC32-NEXT:    li a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 1
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV32ZVBC32-NEXT:    li a2, 1
+; RV32ZVBC32-NEXT:    slli a2, a2, 11
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 2
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 4
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 8
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 16
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 32
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 64
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 2048
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 4096
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 8192
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 16384
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 32768
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 65536
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV32ZVBC32-NEXT:    lui a2, 131072
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
+; RV32ZVBC32-NEXT:    addi a2, sp, 8
+; RV32ZVBC32-NEXT:    sw a0, 8(sp)
+; RV32ZVBC32-NEXT:    sw zero, 12(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a2), zero
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    vand.vv v20, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC32-NEXT:    vmul.vv v16, v12, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vmul.vv v12, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wx v8, v12, a1
+; RV32ZVBC32-NEXT:    addi sp, sp, 16
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv4i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC32-NEXT:    vzext.vf2 v16, v10
+; RV64ZVBC32-NEXT:    li a1, 16
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    li a2, 64
+; RV64ZVBC32-NEXT:    vand.vi v8, v16, 2
+; RV64ZVBC32-NEXT:    vand.vi v20, v16, 1
+; RV64ZVBC32-NEXT:    vmul.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v20, v8
+; RV64ZVBC32-NEXT:    vand.vi v20, v16, 4
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vi v20, v16, 8
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a1
+; RV64ZVBC32-NEXT:    li a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    li a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a1
+; RV64ZVBC32-NEXT:    li a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    li a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a1
+; RV64ZVBC32-NEXT:    li a1, 1
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 11
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 1
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 2
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 4
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 8
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 16
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 32
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 64
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 128
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 512
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    lui a2, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
+; RV64ZVBC32-NEXT:    slli a1, a1, 31
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC32-NEXT:    vmul.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vxor.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wx v8, v12, a0
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 4 x i32> %va to <vscale x 4 x i64>
   %vb.ext = zext <vscale x 4 x i32> %vb to <vscale x 4 x i64>
   %clmul = call <vscale x 4 x i64> @llvm.clmul.nxv4i64(<vscale x 4 x i64> %va.ext, <vscale x 4 x i64> %vb.ext)
@@ -6173,31 +9255,307 @@ define <vscale x 4 x i32> @clmulh_nxv4i32_vx(<vscale x 4 x i32> %va, i32 %b) nou
 ; RV64V-NEXT:    vnsrl.wx v8, v12, a0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv4i32_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v16, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v12, v8
-; RV32ZVBC-NEXT:    vzext.vf2 v8, v16
-; RV32ZVBC-NEXT:    vclmul.vv v12, v12, v8
-; RV32ZVBC-NEXT:    li a0, 32
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wx v8, v12, a0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv4i32_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v16
+; RV32ZVBC64-NEXT:    vclmul.vv v12, v12, v8
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v12, a0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv4i32_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v16
+; RV64ZVBC64-NEXT:    vclmul.vv v12, v12, v8
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v12, a0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv4i32_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v16, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v12, v8
-; RV64ZVBC-NEXT:    vzext.vf2 v8, v16
-; RV64ZVBC-NEXT:    vclmul.vv v12, v12, v8
-; RV64ZVBC-NEXT:    li a0, 32
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wx v8, v12, a0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv4i32_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -16
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC32-NEXT:    li a2, 16
+; RV32ZVBC32-NEXT:    li a1, 32
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 64
+; RV32ZVBC32-NEXT:    vand.vi v20, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v24, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v20, v24, v20
+; RV32ZVBC32-NEXT:    vand.vi v24, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v24
+; RV32ZVBC32-NEXT:    vand.vi v24, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v16, v12, v16
+; RV32ZVBC32-NEXT:    vxor.vv v16, v20, v16
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a1
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 1
+; RV32ZVBC32-NEXT:    slli a2, a2, 11
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 1
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 2
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 4
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 8
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 16
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 32
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 64
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 2048
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 4096
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 8192
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 16384
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 32768
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 65536
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 131072
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV32ZVBC32-NEXT:    addi a2, sp, 8
+; RV32ZVBC32-NEXT:    sw a0, 8(sp)
+; RV32ZVBC32-NEXT:    sw zero, 12(sp)
+; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a2), zero
+; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    vand.vv v20, v8, v24
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    vmul.vv v12, v12, v20
+; RV32ZVBC32-NEXT:    vxor.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wx v8, v12, a1
+; RV32ZVBC32-NEXT:    addi sp, sp, 16
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv4i32_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC32-NEXT:    li a1, 16
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    li a2, 64
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v16
+; RV64ZVBC32-NEXT:    vand.vi v16, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v20, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v16, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v20, v16
+; RV64ZVBC32-NEXT:    vand.vi v20, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vi v20, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a1
+; RV64ZVBC32-NEXT:    li a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    li a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a1
+; RV64ZVBC32-NEXT:    li a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    li a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a1
+; RV64ZVBC32-NEXT:    li a1, 1
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 11
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 64
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 128
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 512
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 262144
+; RV64ZVBC32-NEXT:    slli a1, a1, 31
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vmul.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vxor.vv v12, v16, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wx v8, v12, a0
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i64 0
   %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %va.ext = zext <vscale x 4 x i32> %va to <vscale x 4 x i64>
@@ -6501,27 +9859,319 @@ define <vscale x 8 x i32> @clmulh_nxv8i32_vv(<vscale x 8 x i32> %va, <vscale x 8
 ; RV64V-NEXT:    vnsrl.wx v8, v16, a0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv8i32_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v16, v12
-; RV32ZVBC-NEXT:    vzext.vf2 v24, v8
-; RV32ZVBC-NEXT:    vclmul.vv v16, v24, v16
-; RV32ZVBC-NEXT:    li a0, 32
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wx v8, v16, a0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv8i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v12
+; RV32ZVBC64-NEXT:    vzext.vf2 v24, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v16, v24, v16
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v16, a0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv8i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v12
+; RV64ZVBC64-NEXT:    vzext.vf2 v24, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v16, v24, v16
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v16, a0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv8i32_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v16, v12
-; RV64ZVBC-NEXT:    vzext.vf2 v24, v8
-; RV64ZVBC-NEXT:    vclmul.vv v16, v24, v16
-; RV64ZVBC-NEXT:    li a0, 32
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wx v8, v16, a0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv8i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    sub sp, sp, a0
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC32-NEXT:    vzext.vf2 v24, v12
+; RV32ZVBC32-NEXT:    li a2, 16
+; RV32ZVBC32-NEXT:    li a1, 32
+; RV32ZVBC32-NEXT:    vand.vi v8, v24, 2
+; RV32ZVBC32-NEXT:    vand.vi v0, v24, 1
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v0, v8
+; RV32ZVBC32-NEXT:    vand.vi v0, v24, 4
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vi v0, v24, 8
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    li a2, 64
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a1
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    li a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    li a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    li a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    li a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    li a2, 1
+; RV32ZVBC32-NEXT:    slli a2, a2, 11
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 1
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 2
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 4
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 8
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 16
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 32
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 64
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 2048
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 4096
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 8192
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 16384
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 32768
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 65536
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    lui a2, 131072
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    csrr a3, vlenb
+; RV32ZVBC32-NEXT:    slli a3, a3, 3
+; RV32ZVBC32-NEXT:    add a3, sp, a3
+; RV32ZVBC32-NEXT:    addi a3, a3, 16
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV32ZVBC32-NEXT:    addi a2, sp, 8
+; RV32ZVBC32-NEXT:    sw a0, 8(sp)
+; RV32ZVBC32-NEXT:    sw zero, 12(sp)
+; RV32ZVBC32-NEXT:    vlse64.v v8, (a2), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 16
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 16
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    addi a2, sp, 16
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    vand.vx v24, v24, a0
+; RV32ZVBC32-NEXT:    vmul.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    vxor.vv v24, v0, v24
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    vxor.vv v16, v24, v8
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wx v8, v16, a1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    addi sp, sp, 16
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv8i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC32-NEXT:    vzext.vf2 v24, v12
+; RV64ZVBC32-NEXT:    li a1, 16
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    li a2, 64
+; RV64ZVBC32-NEXT:    vand.vi v8, v24, 2
+; RV64ZVBC32-NEXT:    vand.vi v0, v24, 1
+; RV64ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v0, v8
+; RV64ZVBC32-NEXT:    vand.vi v0, v24, 4
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vi v0, v24, 8
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a1
+; RV64ZVBC32-NEXT:    li a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a0
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    li a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a1
+; RV64ZVBC32-NEXT:    li a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    li a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a1
+; RV64ZVBC32-NEXT:    li a1, 1
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 11
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 1
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 2
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 4
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 8
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 16
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 32
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 64
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 128
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 512
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    lui a2, 262144
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
+; RV64ZVBC32-NEXT:    slli a1, a1, 31
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vand.vx v24, v24, a1
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vmul.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vxor.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wx v8, v16, a0
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 8 x i32> %va to <vscale x 8 x i64>
   %vb.ext = zext <vscale x 8 x i32> %vb to <vscale x 8 x i64>
   %clmul = call <vscale x 8 x i64> @llvm.clmul.nxv8i64(<vscale x 8 x i64> %va.ext, <vscale x 8 x i64> %vb.ext)
@@ -6827,31 +10477,327 @@ define <vscale x 8 x i32> @clmulh_nxv8i32_vx(<vscale x 8 x i32> %va, i32 %b) nou
 ; RV64V-NEXT:    vnsrl.wx v8, v16, a0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv8i32_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v24, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v16, v8
-; RV32ZVBC-NEXT:    vzext.vf2 v8, v24
-; RV32ZVBC-NEXT:    vclmul.vv v16, v16, v8
-; RV32ZVBC-NEXT:    li a0, 32
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wx v8, v16, a0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv8i32_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v24
+; RV32ZVBC64-NEXT:    vclmul.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v16, a0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv8i32_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v24
+; RV64ZVBC64-NEXT:    vclmul.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v16, a0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv8i32_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v24, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v16, v8
-; RV64ZVBC-NEXT:    vzext.vf2 v8, v24
-; RV64ZVBC-NEXT:    vclmul.vv v16, v16, v8
-; RV64ZVBC-NEXT:    li a0, 32
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wx v8, v16, a0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv8i32_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -16
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 4
+; RV32ZVBC32-NEXT:    sub sp, sp, a1
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC32-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC32-NEXT:    li a2, 16
+; RV32ZVBC32-NEXT:    li a1, 32
+; RV32ZVBC32-NEXT:    vzext.vf2 v8, v24
+; RV32ZVBC32-NEXT:    vand.vi v24, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v0, v8, 1
+; RV32ZVBC32-NEXT:    vmul.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v0, v24
+; RV32ZVBC32-NEXT:    vand.vi v0, v8, 4
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vi v0, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 64
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a1
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    li a2, 1
+; RV32ZVBC32-NEXT:    slli a2, a2, 11
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 1
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 2
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 4
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 8
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 16
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 32
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 64
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 128
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 256
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 512
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 1024
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 2048
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 4096
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 8192
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 16384
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 32768
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 65536
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    lui a2, 131072
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    csrr a3, vlenb
+; RV32ZVBC32-NEXT:    slli a3, a3, 3
+; RV32ZVBC32-NEXT:    add a3, sp, a3
+; RV32ZVBC32-NEXT:    addi a3, a3, 16
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV32ZVBC32-NEXT:    addi a2, sp, 8
+; RV32ZVBC32-NEXT:    sw a0, 8(sp)
+; RV32ZVBC32-NEXT:    sw zero, 12(sp)
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a2), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 16
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 16
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v0, v24, v0
+; RV32ZVBC32-NEXT:    lui a0, 262144
+; RV32ZVBC32-NEXT:    addi a2, sp, 16
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v0, v8
+; RV32ZVBC32-NEXT:    vmul.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vnsrl.wx v8, v16, a1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    addi sp, sp, 16
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv8i32_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV64ZVBC32-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC32-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC32-NEXT:    li a1, 16
+; RV64ZVBC32-NEXT:    li a0, 32
+; RV64ZVBC32-NEXT:    li a2, 64
+; RV64ZVBC32-NEXT:    vzext.vf2 v8, v24
+; RV64ZVBC32-NEXT:    vand.vi v24, v8, 2
+; RV64ZVBC32-NEXT:    vand.vi v0, v8, 1
+; RV64ZVBC32-NEXT:    vmul.vv v24, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v0, v24
+; RV64ZVBC32-NEXT:    vand.vi v0, v8, 4
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vi v0, v8, 8
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a1
+; RV64ZVBC32-NEXT:    li a1, 128
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a0
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    li a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a1
+; RV64ZVBC32-NEXT:    li a1, 512
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    li a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a1
+; RV64ZVBC32-NEXT:    li a1, 1
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    slli a2, a1, 11
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 64
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 128
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 256
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 512
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 1024
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 2048
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 4096
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 8192
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 16384
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 32768
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 65536
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 131072
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    lui a2, 262144
+; RV64ZVBC32-NEXT:    slli a1, a1, 31
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vxor.vv v16, v24, v8
+; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC32-NEXT:    vnsrl.wx v8, v16, a0
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i64 0
   %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
   %va.ext = zext <vscale x 8 x i32> %va to <vscale x 8 x i64>
@@ -13399,17 +17345,2214 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vv(<vscale x 1 x i64> %va, <vscale x 1
 ; RV64V-NEXT:    addi sp, sp, 208
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv1i64_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v9
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv1i64_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vclmulh.vv v8, v8, v9
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv1i64_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vclmulh.vv v8, v8, v9
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv1i64_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vclmulh.vv v8, v8, v9
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv1i64_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -352
+; RV32ZVBC32-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    sub sp, sp, a0
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32ZVBC32-NEXT:    vmv1r.v v10, v8
+; RV32ZVBC32-NEXT:    lui s7, 1044480
+; RV32ZVBC32-NEXT:    lui s1, 524288
+; RV32ZVBC32-NEXT:    li s11, 1
+; RV32ZVBC32-NEXT:    li s6, 2
+; RV32ZVBC32-NEXT:    li s5, 4
+; RV32ZVBC32-NEXT:    li s10, 8
+; RV32ZVBC32-NEXT:    li s9, 64
+; RV32ZVBC32-NEXT:    li s8, 128
+; RV32ZVBC32-NEXT:    li s4, 256
+; RV32ZVBC32-NEXT:    li s3, 512
+; RV32ZVBC32-NEXT:    li s2, 1024
+; RV32ZVBC32-NEXT:    lui s0, 1
+; RV32ZVBC32-NEXT:    lui t6, 2
+; RV32ZVBC32-NEXT:    lui t5, 4
+; RV32ZVBC32-NEXT:    lui t4, 8
+; RV32ZVBC32-NEXT:    lui t3, 16
+; RV32ZVBC32-NEXT:    lui t2, 32
+; RV32ZVBC32-NEXT:    lui t1, 64
+; RV32ZVBC32-NEXT:    lui t0, 128
+; RV32ZVBC32-NEXT:    lui a7, 256
+; RV32ZVBC32-NEXT:    lui a5, 512
+; RV32ZVBC32-NEXT:    lui a4, 1024
+; RV32ZVBC32-NEXT:    lui a2, 2048
+; RV32ZVBC32-NEXT:    lui a1, 4096
+; RV32ZVBC32-NEXT:    lui a0, 8192
+; RV32ZVBC32-NEXT:    lui a3, 16384
+; RV32ZVBC32-NEXT:    sw s7, 248(sp)
+; RV32ZVBC32-NEXT:    lui ra, 32768
+; RV32ZVBC32-NEXT:    sw zero, 252(sp)
+; RV32ZVBC32-NEXT:    sw s1, 8(sp)
+; RV32ZVBC32-NEXT:    lui a6, 524288
+; RV32ZVBC32-NEXT:    sw zero, 12(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw s11, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw s6, 260(sp)
+; RV32ZVBC32-NEXT:    lui s6, 65536
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw s5, 268(sp)
+; RV32ZVBC32-NEXT:    lui s7, 131072
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    sw s10, 244(sp)
+; RV32ZVBC32-NEXT:    lui s5, 262144
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    li s1, 16
+; RV32ZVBC32-NEXT:    sw s1, 236(sp)
+; RV32ZVBC32-NEXT:    li s1, 16
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    li s10, 32
+; RV32ZVBC32-NEXT:    sw s10, 228(sp)
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s9, 220(sp)
+; RV32ZVBC32-NEXT:    li s10, 64
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s8, 212(sp)
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s4, 204(sp)
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw s3, 196(sp)
+; RV32ZVBC32-NEXT:    li s9, 512
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw s2, 188(sp)
+; RV32ZVBC32-NEXT:    li s2, 1024
+; RV32ZVBC32-NEXT:    slli s11, s11, 11
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw s11, 180(sp)
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw s0, 172(sp)
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw t6, 164(sp)
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw t5, 156(sp)
+; RV32ZVBC32-NEXT:    lui s8, 4
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw t4, 148(sp)
+; RV32ZVBC32-NEXT:    lui t5, 8
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw t3, 140(sp)
+; RV32ZVBC32-NEXT:    lui s0, 16
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw t2, 132(sp)
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw t1, 124(sp)
+; RV32ZVBC32-NEXT:    lui t3, 64
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw t0, 116(sp)
+; RV32ZVBC32-NEXT:    lui s4, 128
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw a7, 108(sp)
+; RV32ZVBC32-NEXT:    lui t4, 256
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw a5, 100(sp)
+; RV32ZVBC32-NEXT:    lui t0, 512
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw a4, 92(sp)
+; RV32ZVBC32-NEXT:    lui a5, 1024
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw a2, 84(sp)
+; RV32ZVBC32-NEXT:    lui a4, 2048
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw a1, 76(sp)
+; RV32ZVBC32-NEXT:    lui t1, 4096
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw a0, 68(sp)
+; RV32ZVBC32-NEXT:    lui t6, 8192
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw a3, 60(sp)
+; RV32ZVBC32-NEXT:    lui s3, 16384
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw ra, 52(sp)
+; RV32ZVBC32-NEXT:    lui a7, 32768
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw s6, 44(sp)
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    sw s7, 36(sp)
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    sw s5, 28(sp)
+; RV32ZVBC32-NEXT:    sw zero, 16(sp)
+; RV32ZVBC32-NEXT:    sw a6, 20(sp)
+; RV32ZVBC32-NEXT:    lui a0, 61681
+; RV32ZVBC32-NEXT:    addi a0, a0, -241
+; RV32ZVBC32-NEXT:    vmv.v.x v14, a0
+; RV32ZVBC32-NEXT:    lui a0, 209715
+; RV32ZVBC32-NEXT:    addi a0, a0, 819
+; RV32ZVBC32-NEXT:    vmv.v.x v11, a0
+; RV32ZVBC32-NEXT:    lui a0, 349525
+; RV32ZVBC32-NEXT:    addi a0, a0, 1365
+; RV32ZVBC32-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC32-NEXT:    addi a0, sp, 248
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32ZVBC32-NEXT:    vlse64.v v13, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 8
+; RV32ZVBC32-NEXT:    vlse64.v v7, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 272
+; RV32ZVBC32-NEXT:    vlse64.v v31, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 256
+; RV32ZVBC32-NEXT:    vlse64.v v30, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 264
+; RV32ZVBC32-NEXT:    vlse64.v v29, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 240
+; RV32ZVBC32-NEXT:    vlse64.v v26, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 232
+; RV32ZVBC32-NEXT:    vlse64.v v23, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 224
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 216
+; RV32ZVBC32-NEXT:    vlse64.v v17, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 208
+; RV32ZVBC32-NEXT:    vlse64.v v15, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 200
+; RV32ZVBC32-NEXT:    vlse64.v v8, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 192
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 184
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 176
+; RV32ZVBC32-NEXT:    vlse64.v v19, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 168
+; RV32ZVBC32-NEXT:    vlse64.v v21, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 160
+; RV32ZVBC32-NEXT:    vlse64.v v22, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 152
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 144
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 136
+; RV32ZVBC32-NEXT:    vlse64.v v25, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 128
+; RV32ZVBC32-NEXT:    vlse64.v v27, (a0), zero
+; RV32ZVBC32-NEXT:    li ra, 56
+; RV32ZVBC32-NEXT:    vsrl.vi v6, v10, 24
+; RV32ZVBC32-NEXT:    vsrl.vi v5, v10, 8
+; RV32ZVBC32-NEXT:    vsrl.vx v4, v10, ra
+; RV32ZVBC32-NEXT:    li a2, 40
+; RV32ZVBC32-NEXT:    vsrl.vx v3, v10, a2
+; RV32ZVBC32-NEXT:    vsll.vx v2, v10, ra
+; RV32ZVBC32-NEXT:    vsrl.vx v1, v9, ra
+; RV32ZVBC32-NEXT:    vsrl.vx v0, v9, a2
+; RV32ZVBC32-NEXT:    addi a0, s0, -256
+; RV32ZVBC32-NEXT:    vand.vx v3, v3, a0
+; RV32ZVBC32-NEXT:    vor.vv v3, v3, v4
+; RV32ZVBC32-NEXT:    vsll.vx v4, v9, ra
+; RV32ZVBC32-NEXT:    vand.vx v0, v0, a0
+; RV32ZVBC32-NEXT:    vor.vv v1, v0, v1
+; RV32ZVBC32-NEXT:    vand.vx v0, v10, a0
+; RV32ZVBC32-NEXT:    vsll.vx v0, v0, a2
+; RV32ZVBC32-NEXT:    vor.vv v2, v2, v0
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, a0
+; RV32ZVBC32-NEXT:    vsll.vx v0, v0, a2
+; RV32ZVBC32-NEXT:    vor.vv v4, v4, v0
+; RV32ZVBC32-NEXT:    vsrl.vi v0, v9, 24
+; RV32ZVBC32-NEXT:    lui a1, 4080
+; RV32ZVBC32-NEXT:    vand.vx v6, v6, a1
+; RV32ZVBC32-NEXT:    vand.vv v5, v5, v13
+; RV32ZVBC32-NEXT:    vor.vv v8, v5, v6
+; RV32ZVBC32-NEXT:    vsrl.vi v6, v9, 8
+; RV32ZVBC32-NEXT:    vand.vx v5, v0, a1
+; RV32ZVBC32-NEXT:    vand.vv v6, v6, v13
+; RV32ZVBC32-NEXT:    vor.vv v5, v6, v5
+; RV32ZVBC32-NEXT:    addi a3, sp, 120
+; RV32ZVBC32-NEXT:    vlse64.v v6, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vand.vx v5, v9, a1
+; RV32ZVBC32-NEXT:    vsll.vi v5, v5, 24
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v13
+; RV32ZVBC32-NEXT:    vsll.vi v9, v9, 8
+; RV32ZVBC32-NEXT:    vor.vv v9, v5, v9
+; RV32ZVBC32-NEXT:    addi a3, sp, 112
+; RV32ZVBC32-NEXT:    vlse64.v v5, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v9, v4, v9
+; RV32ZVBC32-NEXT:    addi a3, sp, 104
+; RV32ZVBC32-NEXT:    vlse64.v v4, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v9, v9, v1
+; RV32ZVBC32-NEXT:    vsrl.vi v1, v9, 4
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v14
+; RV32ZVBC32-NEXT:    vand.vv v1, v1, v14
+; RV32ZVBC32-NEXT:    vsll.vi v9, v9, 4
+; RV32ZVBC32-NEXT:    vor.vv v9, v1, v9
+; RV32ZVBC32-NEXT:    vsrl.vi v1, v9, 2
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vv v1, v1, v11
+; RV32ZVBC32-NEXT:    vsll.vi v9, v9, 2
+; RV32ZVBC32-NEXT:    vor.vv v9, v1, v9
+; RV32ZVBC32-NEXT:    vsrl.vi v1, v9, 1
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vand.vv v1, v1, v12
+; RV32ZVBC32-NEXT:    vadd.vv v9, v9, v9
+; RV32ZVBC32-NEXT:    vor.vv v9, v1, v9
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s1
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v3
+; RV32ZVBC32-NEXT:    vand.vx v3, v10, a1
+; RV32ZVBC32-NEXT:    vsll.vi v3, v3, 24
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v13
+; RV32ZVBC32-NEXT:    vsll.vi v10, v10, 8
+; RV32ZVBC32-NEXT:    vor.vv v10, v3, v10
+; RV32ZVBC32-NEXT:    li a3, 32
+; RV32ZVBC32-NEXT:    vand.vx v3, v9, a3
+; RV32ZVBC32-NEXT:    vor.vv v10, v2, v10
+; RV32ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v10, v8, 4
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v14
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v10, v8, 2
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v11
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v10, v8, 1
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v2, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v10, v2, v10
+; RV32ZVBC32-NEXT:    vand.vi v2, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v2
+; RV32ZVBC32-NEXT:    vand.vi v2, v9, 8
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, s10
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v1
+; RV32ZVBC32-NEXT:    li a3, 128
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a3
+; RV32ZVBC32-NEXT:    vmul.vv v3, v8, v3
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v3
+; RV32ZVBC32-NEXT:    li a3, 256
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, a3
+; RV32ZVBC32-NEXT:    vmul.vv v3, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v3
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, s9
+; RV32ZVBC32-NEXT:    vmul.vv v3, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v10, v3
+; RV32ZVBC32-NEXT:    vand.vx v10, v9, s2
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v0
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v1, v3, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v1, v2
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s11
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v10, v2, v10
+; RV32ZVBC32-NEXT:    lui a3, 1
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, a3
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v1
+; RV32ZVBC32-NEXT:    lui a3, 2
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a3
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, s8
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t5
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, s0
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t2
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, t3
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s4
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, t4
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t0
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, a5
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a4
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v2
+; RV32ZVBC32-NEXT:    vand.vx v0, v9, t1
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v10, v2
+; RV32ZVBC32-NEXT:    vand.vx v10, v9, t6
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v0
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vxor.vv v1, v2, v1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v1, v10
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s3
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a7
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s6
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s7
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s5
+; RV32ZVBC32-NEXT:    addi a3, sp, 96
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v1
+; RV32ZVBC32-NEXT:    vlse64.v v1, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 88
+; RV32ZVBC32-NEXT:    vand.vv v7, v9, v7
+; RV32ZVBC32-NEXT:    vmul.vv v7, v8, v7
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v7
+; RV32ZVBC32-NEXT:    vlse64.v v7, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 80
+; RV32ZVBC32-NEXT:    vand.vv v31, v9, v31
+; RV32ZVBC32-NEXT:    vmul.vv v31, v8, v31
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v31
+; RV32ZVBC32-NEXT:    vlse64.v v31, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 72
+; RV32ZVBC32-NEXT:    vand.vv v30, v9, v30
+; RV32ZVBC32-NEXT:    vmul.vv v30, v8, v30
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v30
+; RV32ZVBC32-NEXT:    vlse64.v v30, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 64
+; RV32ZVBC32-NEXT:    vand.vv v29, v9, v29
+; RV32ZVBC32-NEXT:    vmul.vv v29, v8, v29
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v29
+; RV32ZVBC32-NEXT:    vlse64.v v29, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 56
+; RV32ZVBC32-NEXT:    vand.vv v26, v9, v26
+; RV32ZVBC32-NEXT:    vmul.vv v26, v8, v26
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v26
+; RV32ZVBC32-NEXT:    vlse64.v v26, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 48
+; RV32ZVBC32-NEXT:    vand.vv v23, v9, v23
+; RV32ZVBC32-NEXT:    vmul.vv v23, v8, v23
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v23
+; RV32ZVBC32-NEXT:    vlse64.v v23, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 40
+; RV32ZVBC32-NEXT:    vand.vv v20, v9, v20
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v20
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 32
+; RV32ZVBC32-NEXT:    vand.vv v17, v9, v17
+; RV32ZVBC32-NEXT:    vmul.vv v17, v8, v17
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v17
+; RV32ZVBC32-NEXT:    vlse64.v v17, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 24
+; RV32ZVBC32-NEXT:    vand.vv v15, v9, v15
+; RV32ZVBC32-NEXT:    vmul.vv v15, v8, v15
+; RV32ZVBC32-NEXT:    vxor.vv v15, v10, v15
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 16
+; RV32ZVBC32-NEXT:    addi a4, sp, 288
+; RV32ZVBC32-NEXT:    vl1r.v v10, (a4) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v10, v9, v10
+; RV32ZVBC32-NEXT:    vand.vv v18, v9, v18
+; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v10, v15, v10
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v18
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a3), zero
+; RV32ZVBC32-NEXT:    vand.vv v16, v9, v16
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v16
+; RV32ZVBC32-NEXT:    vand.vv v16, v9, v19
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v16
+; RV32ZVBC32-NEXT:    vand.vv v16, v9, v21
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v16
+; RV32ZVBC32-NEXT:    vand.vv v16, v9, v22
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v16
+; RV32ZVBC32-NEXT:    vand.vv v16, v9, v24
+; RV32ZVBC32-NEXT:    vand.vv v19, v9, v28
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vmul.vv v19, v8, v19
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v15, a1
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v19
+; RV32ZVBC32-NEXT:    vsrl.vi v19, v10, 24
+; RV32ZVBC32-NEXT:    vand.vx v19, v19, a1
+; RV32ZVBC32-NEXT:    vand.vv v21, v9, v25
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v21
+; RV32ZVBC32-NEXT:    vand.vv v21, v9, v27
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v21
+; RV32ZVBC32-NEXT:    vand.vv v21, v9, v6
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v21
+; RV32ZVBC32-NEXT:    vand.vv v21, v9, v5
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v21
+; RV32ZVBC32-NEXT:    vand.vv v21, v9, v4
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v21
+; RV32ZVBC32-NEXT:    vand.vv v21, v9, v1
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v21
+; RV32ZVBC32-NEXT:    vand.vv v21, v9, v7
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v21
+; RV32ZVBC32-NEXT:    vand.vv v21, v9, v31
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vand.vx v22, v2, a0
+; RV32ZVBC32-NEXT:    vsll.vx v22, v22, a2
+; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v21
+; RV32ZVBC32-NEXT:    vsrl.vx v21, v10, a2
+; RV32ZVBC32-NEXT:    vand.vx v21, v21, a0
+; RV32ZVBC32-NEXT:    vand.vv v24, v9, v30
+; RV32ZVBC32-NEXT:    vand.vv v25, v9, v29
+; RV32ZVBC32-NEXT:    vand.vv v26, v9, v26
+; RV32ZVBC32-NEXT:    vand.vv v23, v9, v23
+; RV32ZVBC32-NEXT:    vand.vv v20, v9, v20
+; RV32ZVBC32-NEXT:    vand.vv v17, v9, v17
+; RV32ZVBC32-NEXT:    vand.vv v27, v9, v0
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v18
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v24
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v25
+; RV32ZVBC32-NEXT:    vmul.vv v25, v8, v26
+; RV32ZVBC32-NEXT:    vmul.vv v23, v8, v23
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vmul.vv v17, v8, v17
+; RV32ZVBC32-NEXT:    vmul.vv v26, v8, v27
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v9, v10, v18
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v24
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v25
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v23
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v20
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v17
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v26
+; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsll.vx v9, v3, ra
+; RV32ZVBC32-NEXT:    vsrl.vx v8, v8, ra
+; RV32ZVBC32-NEXT:    vor.vv v9, v9, v22
+; RV32ZVBC32-NEXT:    vsrl.vi v15, v15, 8
+; RV32ZVBC32-NEXT:    vsll.vi v16, v16, 24
+; RV32ZVBC32-NEXT:    vand.vv v15, v15, v13
+; RV32ZVBC32-NEXT:    vor.vv v15, v15, v19
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v13
+; RV32ZVBC32-NEXT:    vsll.vi v10, v10, 8
+; RV32ZVBC32-NEXT:    vor.vv v10, v16, v10
+; RV32ZVBC32-NEXT:    vor.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vor.vv v8, v21, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v15, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v9, v8, 4
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v14
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v9, v8, 2
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v11
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v9, v8, 1
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 352
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv1i64_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    addi sp, sp, -208
+; RV64ZVBC32-NEXT:    sd ra, 200(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s0, 192(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s1, 184(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s2, 176(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s3, 168(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s4, 160(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s5, 152(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s6, 144(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s7, 136(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s8, 128(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s9, 120(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s10, 112(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s11, 104(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    sub sp, sp, a0
+; RV64ZVBC32-NEXT:    li s6, 56
+; RV64ZVBC32-NEXT:    lui t4, 16
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 24
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 8
+; RV64ZVBC32-NEXT:    li t3, 255
+; RV64ZVBC32-NEXT:    lui a0, 61681
+; RV64ZVBC32-NEXT:    lui a1, 209715
+; RV64ZVBC32-NEXT:    lui a5, 349525
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v9, 24
+; RV64ZVBC32-NEXT:    vsrl.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    li ra, 16
+; RV64ZVBC32-NEXT:    li s11, 32
+; RV64ZVBC32-NEXT:    li s10, 64
+; RV64ZVBC32-NEXT:    li s8, 128
+; RV64ZVBC32-NEXT:    li s9, 256
+; RV64ZVBC32-NEXT:    li a3, 512
+; RV64ZVBC32-NEXT:    li a4, 1024
+; RV64ZVBC32-NEXT:    li t0, 1
+; RV64ZVBC32-NEXT:    lui s5, 1
+; RV64ZVBC32-NEXT:    lui s4, 2
+; RV64ZVBC32-NEXT:    lui s3, 4
+; RV64ZVBC32-NEXT:    lui a7, 8
+; RV64ZVBC32-NEXT:    lui t1, 32
+; RV64ZVBC32-NEXT:    lui t2, 64
+; RV64ZVBC32-NEXT:    lui s1, 128
+; RV64ZVBC32-NEXT:    lui s2, 256
+; RV64ZVBC32-NEXT:    addi t5, a0, -241
+; RV64ZVBC32-NEXT:    addi t6, a1, 819
+; RV64ZVBC32-NEXT:    addi s0, a5, 1365
+; RV64ZVBC32-NEXT:    slli a0, t5, 32
+; RV64ZVBC32-NEXT:    add t5, t5, a0
+; RV64ZVBC32-NEXT:    slli a0, t6, 32
+; RV64ZVBC32-NEXT:    add t6, t6, a0
+; RV64ZVBC32-NEXT:    slli a0, s0, 32
+; RV64ZVBC32-NEXT:    add s0, s0, a0
+; RV64ZVBC32-NEXT:    addi t4, t4, -256
+; RV64ZVBC32-NEXT:    lui a2, 16
+; RV64ZVBC32-NEXT:    slli t3, t3, 24
+; RV64ZVBC32-NEXT:    vsrl.vx v13, v9, s6
+; RV64ZVBC32-NEXT:    li a0, 40
+; RV64ZVBC32-NEXT:    vsrl.vx v14, v9, a0
+; RV64ZVBC32-NEXT:    lui a1, 4080
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a1
+; RV64ZVBC32-NEXT:    vand.vx v15, v9, a1
+; RV64ZVBC32-NEXT:    vsll.vx v17, v9, s6
+; RV64ZVBC32-NEXT:    vand.vx v14, v14, t4
+; RV64ZVBC32-NEXT:    vand.vx v11, v11, t3
+; RV64ZVBC32-NEXT:    vsll.vi v15, v15, 24
+; RV64ZVBC32-NEXT:    vand.vx v18, v9, t3
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, t4
+; RV64ZVBC32-NEXT:    vor.vv v13, v14, v13
+; RV64ZVBC32-NEXT:    vor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    vsll.vi v12, v18, 8
+; RV64ZVBC32-NEXT:    vsll.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    li a5, 40
+; RV64ZVBC32-NEXT:    vor.vv v11, v11, v13
+; RV64ZVBC32-NEXT:    vor.vv v12, v15, v12
+; RV64ZVBC32-NEXT:    vor.vv v9, v17, v9
+; RV64ZVBC32-NEXT:    vor.vv v9, v9, v12
+; RV64ZVBC32-NEXT:    vor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vsrl.vi v11, v9, 4
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, t5
+; RV64ZVBC32-NEXT:    vand.vx v11, v11, t5
+; RV64ZVBC32-NEXT:    vsll.vi v9, v9, 4
+; RV64ZVBC32-NEXT:    vor.vv v9, v11, v9
+; RV64ZVBC32-NEXT:    vsrl.vi v11, v9, 2
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, t6
+; RV64ZVBC32-NEXT:    vand.vx v11, v11, t6
+; RV64ZVBC32-NEXT:    vsll.vi v9, v9, 2
+; RV64ZVBC32-NEXT:    vor.vv v9, v11, v9
+; RV64ZVBC32-NEXT:    vsrl.vi v11, v9, 1
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, s0
+; RV64ZVBC32-NEXT:    vand.vx v11, v11, s0
+; RV64ZVBC32-NEXT:    vadd.vv v9, v9, v9
+; RV64ZVBC32-NEXT:    vor.vv v11, v11, v9
+; RV64ZVBC32-NEXT:    vand.vx v14, v11, ra
+; RV64ZVBC32-NEXT:    lui a0, 4096
+; RV64ZVBC32-NEXT:    vand.vx v15, v11, s11
+; RV64ZVBC32-NEXT:    lui a1, 8192
+; RV64ZVBC32-NEXT:    vand.vx v17, v11, s10
+; RV64ZVBC32-NEXT:    lui a6, 16384
+; RV64ZVBC32-NEXT:    vand.vx v18, v11, s8
+; RV64ZVBC32-NEXT:    lui s6, 32768
+; RV64ZVBC32-NEXT:    vand.vx v19, v11, s9
+; RV64ZVBC32-NEXT:    lui s7, 65536
+; RV64ZVBC32-NEXT:    vand.vx v20, v11, a3
+; RV64ZVBC32-NEXT:    lui s8, 131072
+; RV64ZVBC32-NEXT:    vand.vx v21, v11, a4
+; RV64ZVBC32-NEXT:    slli a3, t0, 11
+; RV64ZVBC32-NEXT:    vand.vx v22, v11, a3
+; RV64ZVBC32-NEXT:    lui s10, 262144
+; RV64ZVBC32-NEXT:    li a3, 56
+; RV64ZVBC32-NEXT:    vsrl.vx v4, v8, a3
+; RV64ZVBC32-NEXT:    vsrl.vx v0, v8, a5
+; RV64ZVBC32-NEXT:    li a5, 40
+; RV64ZVBC32-NEXT:    lui a4, 4080
+; RV64ZVBC32-NEXT:    vand.vx v1, v16, a4
+; RV64ZVBC32-NEXT:    vand.vx v3, v8, a4
+; RV64ZVBC32-NEXT:    vsll.vx v2, v8, a3
+; RV64ZVBC32-NEXT:    vand.vx v23, v11, s5
+; RV64ZVBC32-NEXT:    slli s11, t0, 31
+; RV64ZVBC32-NEXT:    vand.vx v24, v11, s4
+; RV64ZVBC32-NEXT:    slli ra, t0, 32
+; RV64ZVBC32-NEXT:    vand.vx v25, v11, s3
+; RV64ZVBC32-NEXT:    slli s9, t0, 33
+; RV64ZVBC32-NEXT:    vand.vx v26, v11, a7
+; RV64ZVBC32-NEXT:    slli a3, t0, 34
+; RV64ZVBC32-NEXT:    sd a3, 80(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v27, v11, a2
+; RV64ZVBC32-NEXT:    slli a2, t0, 35
+; RV64ZVBC32-NEXT:    sd a2, 72(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v28, v11, t1
+; RV64ZVBC32-NEXT:    slli a2, t0, 36
+; RV64ZVBC32-NEXT:    sd a2, 64(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v29, v11, t2
+; RV64ZVBC32-NEXT:    slli a2, t0, 37
+; RV64ZVBC32-NEXT:    sd a2, 56(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v30, v11, s1
+; RV64ZVBC32-NEXT:    slli a2, t0, 38
+; RV64ZVBC32-NEXT:    sd a2, 48(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v31, v11, s2
+; RV64ZVBC32-NEXT:    slli a2, t0, 39
+; RV64ZVBC32-NEXT:    sd a2, 40(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    lui a2, 512
+; RV64ZVBC32-NEXT:    vand.vx v7, v11, a2
+; RV64ZVBC32-NEXT:    slli a2, t0, 40
+; RV64ZVBC32-NEXT:    sd a2, 32(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    lui a2, 1024
+; RV64ZVBC32-NEXT:    vand.vx v6, v11, a2
+; RV64ZVBC32-NEXT:    slli a2, t0, 41
+; RV64ZVBC32-NEXT:    sd a2, 24(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    lui a2, 2048
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, a2
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a2) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s2, t0, 42
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, a0
+; RV64ZVBC32-NEXT:    sd s2, 8(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a2, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a2
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s3, t0, 43
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, a1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s4, t0, 44
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, a6
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s5, t0, 45
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s6
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s6, t0, 46
+; RV64ZVBC32-NEXT:    vand.vx v9, v0, t4
+; RV64ZVBC32-NEXT:    vor.vv v9, v9, v4
+; RV64ZVBC32-NEXT:    vand.vx v12, v11, s7
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s7, t0, 47
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, t3
+; RV64ZVBC32-NEXT:    vor.vv v10, v10, v1
+; RV64ZVBC32-NEXT:    vand.vx v12, v11, s8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s8, t0, 48
+; RV64ZVBC32-NEXT:    vsll.vi v12, v3, 24
+; RV64ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vand.vx v10, v8, t3
+; RV64ZVBC32-NEXT:    vsll.vi v10, v10, 8
+; RV64ZVBC32-NEXT:    vor.vv v10, v12, v10
+; RV64ZVBC32-NEXT:    vand.vx v12, v11, s10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s10, t0, 49
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, t4
+; RV64ZVBC32-NEXT:    vsll.vx v8, v8, a5
+; RV64ZVBC32-NEXT:    vor.vv v8, v2, v8
+; RV64ZVBC32-NEXT:    vand.vx v12, v11, s11
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s11, t0, 50
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v10
+; RV64ZVBC32-NEXT:    vand.vx v10, v11, ra
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli ra, t0, 51
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 4
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, t5
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, t5
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, t6
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, t6
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 1
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, s0
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, s0
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s9
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli t2, t0, 52
+; RV64ZVBC32-NEXT:    slli t1, t0, 53
+; RV64ZVBC32-NEXT:    slli a7, t0, 54
+; RV64ZVBC32-NEXT:    slli s9, t0, 55
+; RV64ZVBC32-NEXT:    slli a6, t0, 56
+; RV64ZVBC32-NEXT:    slli a5, t0, 57
+; RV64ZVBC32-NEXT:    slli a4, t0, 58
+; RV64ZVBC32-NEXT:    slli a2, t0, 59
+; RV64ZVBC32-NEXT:    slli a1, t0, 60
+; RV64ZVBC32-NEXT:    slli a3, t0, 61
+; RV64ZVBC32-NEXT:    slli t0, t0, 62
+; RV64ZVBC32-NEXT:    li a0, -1
+; RV64ZVBC32-NEXT:    slli a0, a0, 63
+; RV64ZVBC32-NEXT:    ld s1, 80(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s1
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    slli s1, s1, 5
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s1, 72(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s1
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    slli s2, s1, 5
+; RV64ZVBC32-NEXT:    sub s1, s2, s1
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s1, 64(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s1
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    mv s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s2, s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s2, s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s1, s1, s2
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s1, 56(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s1
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    mv s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 2
+; RV64ZVBC32-NEXT:    add s2, s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s2, s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s1, s1, s2
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s1, 48(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s1
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    slli s1, s1, 2
+; RV64ZVBC32-NEXT:    mv s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s2, s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s1, s1, s2
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s1
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    mv s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s2, s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 2
+; RV64ZVBC32-NEXT:    add s2, s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s1, s1, s2
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s1
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    mv s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 2
+; RV64ZVBC32-NEXT:    add s2, s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s1, s1, s2
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s1
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    mv s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 3
+; RV64ZVBC32-NEXT:    add s2, s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s1, s1, s2
+; RV64ZVBC32-NEXT:    ld s2, 8(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s2
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    slli s1, s1, 3
+; RV64ZVBC32-NEXT:    mv s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s1, s1, s2
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s3
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    mv s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s2, s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s2, s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 2
+; RV64ZVBC32-NEXT:    add s1, s1, s2
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s4
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    mv s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s2, s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 2
+; RV64ZVBC32-NEXT:    add s1, s1, s2
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s5
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    mv s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 2
+; RV64ZVBC32-NEXT:    add s2, s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 2
+; RV64ZVBC32-NEXT:    add s1, s1, s2
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s6
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    slli s1, s1, 2
+; RV64ZVBC32-NEXT:    mv s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 2
+; RV64ZVBC32-NEXT:    add s1, s1, s2
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s7
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    mv s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    add s2, s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 3
+; RV64ZVBC32-NEXT:    add s1, s1, s2
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s8
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    slli s1, s1, 1
+; RV64ZVBC32-NEXT:    mv s2, s1
+; RV64ZVBC32-NEXT:    slli s1, s1, 3
+; RV64ZVBC32-NEXT:    add s1, s1, s2
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s10
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    slli s2, s1, 4
+; RV64ZVBC32-NEXT:    add s1, s2, s1
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s11
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    slli s1, s1, 4
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, ra
+; RV64ZVBC32-NEXT:    csrr s1, vlenb
+; RV64ZVBC32-NEXT:    slli s2, s1, 4
+; RV64ZVBC32-NEXT:    sub s1, s2, s1
+; RV64ZVBC32-NEXT:    add s1, sp, s1
+; RV64ZVBC32-NEXT:    addi s1, s1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, t2
+; RV64ZVBC32-NEXT:    csrr t2, vlenb
+; RV64ZVBC32-NEXT:    slli t2, t2, 1
+; RV64ZVBC32-NEXT:    mv s1, t2
+; RV64ZVBC32-NEXT:    slli t2, t2, 1
+; RV64ZVBC32-NEXT:    add s1, s1, t2
+; RV64ZVBC32-NEXT:    slli t2, t2, 1
+; RV64ZVBC32-NEXT:    add t2, t2, s1
+; RV64ZVBC32-NEXT:    add t2, sp, t2
+; RV64ZVBC32-NEXT:    addi t2, t2, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (t2) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, t1
+; RV64ZVBC32-NEXT:    csrr t1, vlenb
+; RV64ZVBC32-NEXT:    mv t2, t1
+; RV64ZVBC32-NEXT:    slli t1, t1, 2
+; RV64ZVBC32-NEXT:    add t2, t2, t1
+; RV64ZVBC32-NEXT:    slli t1, t1, 1
+; RV64ZVBC32-NEXT:    add t1, t1, t2
+; RV64ZVBC32-NEXT:    add t1, sp, t1
+; RV64ZVBC32-NEXT:    addi t1, t1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (t1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, a7
+; RV64ZVBC32-NEXT:    csrr a7, vlenb
+; RV64ZVBC32-NEXT:    slli a7, a7, 2
+; RV64ZVBC32-NEXT:    mv t1, a7
+; RV64ZVBC32-NEXT:    slli a7, a7, 1
+; RV64ZVBC32-NEXT:    add a7, a7, t1
+; RV64ZVBC32-NEXT:    add a7, sp, a7
+; RV64ZVBC32-NEXT:    addi a7, a7, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a7) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, s9
+; RV64ZVBC32-NEXT:    csrr a7, vlenb
+; RV64ZVBC32-NEXT:    mv t1, a7
+; RV64ZVBC32-NEXT:    slli a7, a7, 1
+; RV64ZVBC32-NEXT:    add t1, t1, a7
+; RV64ZVBC32-NEXT:    slli a7, a7, 2
+; RV64ZVBC32-NEXT:    add a7, a7, t1
+; RV64ZVBC32-NEXT:    add a7, sp, a7
+; RV64ZVBC32-NEXT:    addi a7, a7, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a7) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, a6
+; RV64ZVBC32-NEXT:    csrr a6, vlenb
+; RV64ZVBC32-NEXT:    slli a6, a6, 1
+; RV64ZVBC32-NEXT:    mv a7, a6
+; RV64ZVBC32-NEXT:    slli a6, a6, 2
+; RV64ZVBC32-NEXT:    add a6, a6, a7
+; RV64ZVBC32-NEXT:    add a6, sp, a6
+; RV64ZVBC32-NEXT:    addi a6, a6, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a6) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, a5
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a6, a5, 3
+; RV64ZVBC32-NEXT:    add a5, a6, a5
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a5) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, a4
+; RV64ZVBC32-NEXT:    csrr a4, vlenb
+; RV64ZVBC32-NEXT:    slli a4, a4, 3
+; RV64ZVBC32-NEXT:    add a4, sp, a4
+; RV64ZVBC32-NEXT:    addi a4, a4, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a4) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, a2
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a4, a2, 3
+; RV64ZVBC32-NEXT:    sub a2, a4, a2
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a2) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, a1
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a2
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 96
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vi v9, v11, 2
+; RV64ZVBC32-NEXT:    vand.vi v10, v11, 1
+; RV64ZVBC32-NEXT:    vand.vi v12, v11, 4
+; RV64ZVBC32-NEXT:    vand.vi v13, v11, 8
+; RV64ZVBC32-NEXT:    vand.vx v5, v11, a3
+; RV64ZVBC32-NEXT:    vand.vx v4, v11, t0
+; RV64ZVBC32-NEXT:    vand.vx v2, v11, a0
+; RV64ZVBC32-NEXT:    vmul.vv v3, v8, v9
+; RV64ZVBC32-NEXT:    vmul.vv v9, v8, v10
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v12
+; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v13
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v14
+; RV64ZVBC32-NEXT:    vmul.vv v13, v8, v15
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v17
+; RV64ZVBC32-NEXT:    vmul.vv v15, v8, v18
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v19
+; RV64ZVBC32-NEXT:    vmul.vv v17, v8, v20
+; RV64ZVBC32-NEXT:    vmul.vv v18, v8, v21
+; RV64ZVBC32-NEXT:    vmul.vv v19, v8, v22
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v23
+; RV64ZVBC32-NEXT:    vmul.vv v21, v8, v24
+; RV64ZVBC32-NEXT:    vmul.vv v22, v8, v25
+; RV64ZVBC32-NEXT:    vmul.vv v23, v8, v26
+; RV64ZVBC32-NEXT:    vmul.vv v24, v8, v27
+; RV64ZVBC32-NEXT:    vmul.vv v25, v8, v28
+; RV64ZVBC32-NEXT:    vmul.vv v26, v8, v29
+; RV64ZVBC32-NEXT:    vmul.vv v27, v8, v30
+; RV64ZVBC32-NEXT:    vmul.vv v28, v8, v31
+; RV64ZVBC32-NEXT:    vmul.vv v29, v8, v7
+; RV64ZVBC32-NEXT:    vmul.vv v30, v8, v6
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v31, v8, v31
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v7, v8, v7
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v6, v8, v6
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    addi a0, sp, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 5
+; RV64ZVBC32-NEXT:    sub a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 5
+; RV64ZVBC32-NEXT:    sub a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 4
+; RV64ZVBC32-NEXT:    sub a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 3
+; RV64ZVBC32-NEXT:    sub a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 96
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vmul.vv v1, v8, v5
+; RV64ZVBC32-NEXT:    vmul.vv v5, v8, v4
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v2
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v3
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v13
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v14
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v15
+; RV64ZVBC32-NEXT:    vxor.vv v10, v9, v16
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v17
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v18
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v19
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v20
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v21
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v22
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v23
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v24
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v25
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v26
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v27
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v28
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v29
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v30
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v31
+; RV64ZVBC32-NEXT:    vxor.vv v11, v10, v7
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v6
+; RV64ZVBC32-NEXT:    li a0, 56
+; RV64ZVBC32-NEXT:    vsll.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, t4
+; RV64ZVBC32-NEXT:    li a1, 40
+; RV64ZVBC32-NEXT:    vsll.vx v10, v10, a1
+; RV64ZVBC32-NEXT:    vor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a3, a2
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v0
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a3, a2
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    addi a2, sp, 96
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a2, 5
+; RV64ZVBC32-NEXT:    add a2, a3, a2
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 5
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a2, 5
+; RV64ZVBC32-NEXT:    sub a2, a3, a2
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 3
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 3
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 3
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v11, v12
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a2, 4
+; RV64ZVBC32-NEXT:    add a2, a3, a2
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 4
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    vsrl.vi v13, v10, 8
+; RV64ZVBC32-NEXT:    vand.vx v13, v13, t3
+; RV64ZVBC32-NEXT:    vsrl.vi v11, v11, 24
+; RV64ZVBC32-NEXT:    lui a2, 4080
+; RV64ZVBC32-NEXT:    vand.vx v11, v11, a2
+; RV64ZVBC32-NEXT:    vor.vv v11, v13, v11
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 96
+; RV64ZVBC32-NEXT:    vl1r.v v13, (a3) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 96
+; RV64ZVBC32-NEXT:    vl1r.v v13, (a3) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 96
+; RV64ZVBC32-NEXT:    vl1r.v v13, (a3) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 96
+; RV64ZVBC32-NEXT:    vl1r.v v13, (a3) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 96
+; RV64ZVBC32-NEXT:    vl1r.v v13, (a3) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a2
+; RV64ZVBC32-NEXT:    vsll.vi v10, v10, 24
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 3
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v13, v12, v13
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 3
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v14, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v13, v13, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, t3
+; RV64ZVBC32-NEXT:    vsll.vi v14, v14, 8
+; RV64ZVBC32-NEXT:    vor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v14, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v13, v13, v14
+; RV64ZVBC32-NEXT:    vor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 96
+; RV64ZVBC32-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v13, v10
+; RV64ZVBC32-NEXT:    vsrl.vx v12, v12, a1
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, t4
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    mv a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a2
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 96
+; RV64ZVBC32-NEXT:    vl1r.v v13, (a1) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v13
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v1
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v5
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsrl.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v11, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 4
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, t5
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, t5
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, t6
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, t6
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 1
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, s0
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, s0
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add sp, sp, a0
+; RV64ZVBC32-NEXT:    ld ra, 200(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s0, 192(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s1, 184(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s2, 176(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s3, 168(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s4, 160(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s5, 152(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s6, 144(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s7, 136(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s8, 128(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s9, 120(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s10, 112(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s11, 104(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    addi sp, sp, 208
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 1 x i64> %va to <vscale x 1 x i128>
   %vb.ext = zext <vscale x 1 x i64> %vb to <vscale x 1 x i128>
   %clmul = call <vscale x 1 x i128> @llvm.clmul.nxv1i128(<vscale x 1 x i128> %va.ext, <vscale x 1 x i128> %vb.ext)
@@ -14331,23 +20474,935 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vx(<vscale x 1 x i64> %va, i64 %b) nou
 ; RV64V-NEXT:    vsrl.vi v8, v8, 1
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv1i64_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    addi sp, sp, -16
-; RV32ZVBC-NEXT:    sw a0, 8(sp)
-; RV32ZVBC-NEXT:    sw a1, 12(sp)
-; RV32ZVBC-NEXT:    addi a0, sp, 8
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vlse64.v v9, (a0), zero
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v9
-; RV32ZVBC-NEXT:    addi sp, sp, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv1i64_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    addi sp, sp, -16
+; RV32ZVBC64-NEXT:    sw a0, 8(sp)
+; RV32ZVBC64-NEXT:    sw a1, 12(sp)
+; RV32ZVBC64-NEXT:    addi a0, sp, 8
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vlse64.v v9, (a0), zero
+; RV32ZVBC64-NEXT:    vclmulh.vv v8, v8, v9
+; RV32ZVBC64-NEXT:    addi sp, sp, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv1i64_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vclmulh.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv1i64_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vclmulh.vx v8, v8, a0
-; RV64ZVBC-NEXT:    ret
+; RV32ZVBC32-LABEL: clmulh_nxv1i64_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -368
+; RV32ZVBC32-NEXT:    sw ra, 364(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 360(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 356(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 352(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    sub sp, sp, a2
+; RV32ZVBC32-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV32ZVBC32-NEXT:    vmv1r.v v9, v8
+; RV32ZVBC32-NEXT:    sw a0, 16(sp)
+; RV32ZVBC32-NEXT:    sw a1, 20(sp)
+; RV32ZVBC32-NEXT:    addi t6, sp, 16
+; RV32ZVBC32-NEXT:    lui s0, 1044480
+; RV32ZVBC32-NEXT:    lui a4, 524288
+; RV32ZVBC32-NEXT:    li s11, 1
+; RV32ZVBC32-NEXT:    li s6, 2
+; RV32ZVBC32-NEXT:    li s7, 4
+; RV32ZVBC32-NEXT:    li s8, 8
+; RV32ZVBC32-NEXT:    li s10, 16
+; RV32ZVBC32-NEXT:    li s9, 32
+; RV32ZVBC32-NEXT:    li s1, 64
+; RV32ZVBC32-NEXT:    li s2, 128
+; RV32ZVBC32-NEXT:    li s3, 256
+; RV32ZVBC32-NEXT:    li s4, 512
+; RV32ZVBC32-NEXT:    li s5, 1024
+; RV32ZVBC32-NEXT:    lui a3, 1
+; RV32ZVBC32-NEXT:    lui ra, 2
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    lui a1, 8
+; RV32ZVBC32-NEXT:    lui a5, 16
+; RV32ZVBC32-NEXT:    lui a2, 32
+; RV32ZVBC32-NEXT:    lui a6, 64
+; RV32ZVBC32-NEXT:    lui a7, 128
+; RV32ZVBC32-NEXT:    lui t0, 256
+; RV32ZVBC32-NEXT:    lui t1, 512
+; RV32ZVBC32-NEXT:    lui t2, 1024
+; RV32ZVBC32-NEXT:    lui t3, 2048
+; RV32ZVBC32-NEXT:    lui t4, 4096
+; RV32ZVBC32-NEXT:    lui t5, 8192
+; RV32ZVBC32-NEXT:    vlse64.v v13, (t6), zero
+; RV32ZVBC32-NEXT:    lui t6, 16384
+; RV32ZVBC32-NEXT:    sw s0, 264(sp)
+; RV32ZVBC32-NEXT:    lui s0, 32768
+; RV32ZVBC32-NEXT:    sw zero, 268(sp)
+; RV32ZVBC32-NEXT:    sw a4, 24(sp)
+; RV32ZVBC32-NEXT:    sw zero, 28(sp)
+; RV32ZVBC32-NEXT:    sw zero, 288(sp)
+; RV32ZVBC32-NEXT:    sw s11, 292(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw s6, 276(sp)
+; RV32ZVBC32-NEXT:    lui s6, 65536
+; RV32ZVBC32-NEXT:    sw zero, 280(sp)
+; RV32ZVBC32-NEXT:    sw s7, 284(sp)
+; RV32ZVBC32-NEXT:    lui s7, 131072
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw s8, 260(sp)
+; RV32ZVBC32-NEXT:    lui s8, 262144
+; RV32ZVBC32-NEXT:    sw zero, 248(sp)
+; RV32ZVBC32-NEXT:    sw s10, 252(sp)
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    sw s9, 244(sp)
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    sw s1, 236(sp)
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw s2, 228(sp)
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s3, 220(sp)
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s4, 212(sp)
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s5, 204(sp)
+; RV32ZVBC32-NEXT:    slli s11, s11, 11
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw s11, 196(sp)
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw a3, 188(sp)
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw ra, 180(sp)
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw a0, 172(sp)
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw a1, 164(sp)
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw a5, 156(sp)
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw a2, 148(sp)
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw a6, 140(sp)
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw a7, 132(sp)
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw t0, 124(sp)
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw t1, 116(sp)
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw t2, 108(sp)
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw t3, 100(sp)
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw t4, 92(sp)
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw t5, 84(sp)
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw t6, 76(sp)
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw s0, 68(sp)
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw s6, 60(sp)
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw s7, 52(sp)
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw s8, 44(sp)
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    sw a4, 36(sp)
+; RV32ZVBC32-NEXT:    lui a0, 61681
+; RV32ZVBC32-NEXT:    addi a0, a0, -241
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v14, a0
+; RV32ZVBC32-NEXT:    lui a0, 209715
+; RV32ZVBC32-NEXT:    addi a0, a0, 819
+; RV32ZVBC32-NEXT:    vmv.v.x v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 349525
+; RV32ZVBC32-NEXT:    addi a0, a0, 1365
+; RV32ZVBC32-NEXT:    vmv.v.x v11, a0
+; RV32ZVBC32-NEXT:    addi a0, sp, 264
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 24
+; RV32ZVBC32-NEXT:    vlse64.v v7, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vlse64.v v31, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 272
+; RV32ZVBC32-NEXT:    vlse64.v v30, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 280
+; RV32ZVBC32-NEXT:    vlse64.v v29, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 256
+; RV32ZVBC32-NEXT:    vlse64.v v26, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 248
+; RV32ZVBC32-NEXT:    vlse64.v v23, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 240
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 232
+; RV32ZVBC32-NEXT:    vlse64.v v17, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 224
+; RV32ZVBC32-NEXT:    vlse64.v v15, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 216
+; RV32ZVBC32-NEXT:    vlse64.v v8, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 304
+; RV32ZVBC32-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 208
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 200
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 192
+; RV32ZVBC32-NEXT:    vlse64.v v19, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 184
+; RV32ZVBC32-NEXT:    vlse64.v v21, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 176
+; RV32ZVBC32-NEXT:    vlse64.v v22, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 168
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 160
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 152
+; RV32ZVBC32-NEXT:    vlse64.v v25, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 144
+; RV32ZVBC32-NEXT:    vlse64.v v27, (a0), zero
+; RV32ZVBC32-NEXT:    li ra, 56
+; RV32ZVBC32-NEXT:    vsrl.vi v6, v9, 24
+; RV32ZVBC32-NEXT:    vsrl.vi v5, v9, 8
+; RV32ZVBC32-NEXT:    vsrl.vx v4, v9, ra
+; RV32ZVBC32-NEXT:    li a2, 40
+; RV32ZVBC32-NEXT:    vsrl.vx v3, v9, a2
+; RV32ZVBC32-NEXT:    lui a1, 4080
+; RV32ZVBC32-NEXT:    vand.vx v6, v6, a1
+; RV32ZVBC32-NEXT:    vsll.vx v2, v9, ra
+; RV32ZVBC32-NEXT:    addi a0, a5, -256
+; RV32ZVBC32-NEXT:    vand.vx v3, v3, a0
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a0
+; RV32ZVBC32-NEXT:    vor.vv v3, v3, v4
+; RV32ZVBC32-NEXT:    vsll.vx v4, v1, a2
+; RV32ZVBC32-NEXT:    vor.vv v2, v2, v4
+; RV32ZVBC32-NEXT:    vsrl.vx v4, v13, ra
+; RV32ZVBC32-NEXT:    vsrl.vx v1, v13, a2
+; RV32ZVBC32-NEXT:    vsll.vx v0, v13, ra
+; RV32ZVBC32-NEXT:    vand.vx v1, v1, a0
+; RV32ZVBC32-NEXT:    vor.vv v4, v1, v4
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, a0
+; RV32ZVBC32-NEXT:    vsll.vx v1, v1, a2
+; RV32ZVBC32-NEXT:    vor.vv v1, v0, v1
+; RV32ZVBC32-NEXT:    vsrl.vi v0, v13, 24
+; RV32ZVBC32-NEXT:    vand.vv v5, v5, v12
+; RV32ZVBC32-NEXT:    vor.vv v8, v5, v6
+; RV32ZVBC32-NEXT:    vsrl.vi v6, v13, 8
+; RV32ZVBC32-NEXT:    vand.vx v5, v0, a1
+; RV32ZVBC32-NEXT:    vand.vv v6, v6, v12
+; RV32ZVBC32-NEXT:    vor.vv v5, v6, v5
+; RV32ZVBC32-NEXT:    addi a3, sp, 136
+; RV32ZVBC32-NEXT:    vlse64.v v6, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v0, v5, v4
+; RV32ZVBC32-NEXT:    vand.vx v5, v13, a1
+; RV32ZVBC32-NEXT:    vsll.vi v5, v5, 24
+; RV32ZVBC32-NEXT:    vand.vv v13, v13, v12
+; RV32ZVBC32-NEXT:    vsll.vi v13, v13, 8
+; RV32ZVBC32-NEXT:    vor.vv v13, v5, v13
+; RV32ZVBC32-NEXT:    addi a3, sp, 128
+; RV32ZVBC32-NEXT:    vlse64.v v5, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v13, v1, v13
+; RV32ZVBC32-NEXT:    addi a3, sp, 120
+; RV32ZVBC32-NEXT:    vlse64.v v4, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v13, v13, v0
+; RV32ZVBC32-NEXT:    vsrl.vi v1, v13, 4
+; RV32ZVBC32-NEXT:    vand.vv v13, v13, v14
+; RV32ZVBC32-NEXT:    vand.vv v1, v1, v14
+; RV32ZVBC32-NEXT:    vsll.vi v13, v13, 4
+; RV32ZVBC32-NEXT:    vor.vv v13, v1, v13
+; RV32ZVBC32-NEXT:    vsrl.vi v1, v13, 2
+; RV32ZVBC32-NEXT:    vand.vv v13, v13, v10
+; RV32ZVBC32-NEXT:    vand.vv v1, v1, v10
+; RV32ZVBC32-NEXT:    vsll.vi v13, v13, 2
+; RV32ZVBC32-NEXT:    vor.vv v13, v1, v13
+; RV32ZVBC32-NEXT:    vsrl.vi v1, v13, 1
+; RV32ZVBC32-NEXT:    vand.vv v13, v13, v11
+; RV32ZVBC32-NEXT:    vand.vv v1, v1, v11
+; RV32ZVBC32-NEXT:    vadd.vv v13, v13, v13
+; RV32ZVBC32-NEXT:    vor.vv v13, v1, v13
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, s10
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v3
+; RV32ZVBC32-NEXT:    vand.vx v3, v9, a1
+; RV32ZVBC32-NEXT:    vsll.vi v3, v3, 24
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vsll.vi v9, v9, 8
+; RV32ZVBC32-NEXT:    vor.vv v9, v3, v9
+; RV32ZVBC32-NEXT:    vand.vx v3, v13, s9
+; RV32ZVBC32-NEXT:    vor.vv v9, v2, v9
+; RV32ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v9, v8, 4
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v14
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v9, v8, 2
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v10
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v9, v8, 1
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v11
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vand.vi v9, v13, 2
+; RV32ZVBC32-NEXT:    vand.vi v2, v13, 1
+; RV32ZVBC32-NEXT:    vmul.vv v9, v8, v9
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v2, v9
+; RV32ZVBC32-NEXT:    vand.vi v2, v13, 4
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v2
+; RV32ZVBC32-NEXT:    vand.vi v2, v13, 8
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, s1
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, s2
+; RV32ZVBC32-NEXT:    vmul.vv v3, v8, v3
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v3
+; RV32ZVBC32-NEXT:    vand.vx v0, v13, s3
+; RV32ZVBC32-NEXT:    vmul.vv v3, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v3
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, s4
+; RV32ZVBC32-NEXT:    vmul.vv v3, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v9, v3
+; RV32ZVBC32-NEXT:    vand.vx v9, v13, s5
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v0
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v1, v3, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v1, v2
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, s11
+; RV32ZVBC32-NEXT:    vmul.vv v9, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v9, v2, v9
+; RV32ZVBC32-NEXT:    lui a3, 1
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, a3
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v1
+; RV32ZVBC32-NEXT:    lui a3, 2
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, a3
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v2
+; RV32ZVBC32-NEXT:    lui a3, 4
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, a3
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v1
+; RV32ZVBC32-NEXT:    lui a3, 8
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, a3
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, a5
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v1
+; RV32ZVBC32-NEXT:    lui a3, 32
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, a3
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, a6
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, a7
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, t0
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, t1
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, t2
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, t3
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v2
+; RV32ZVBC32-NEXT:    vand.vx v0, v13, t4
+; RV32ZVBC32-NEXT:    vmul.vv v2, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v9, v2
+; RV32ZVBC32-NEXT:    vand.vx v9, v13, t5
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v0
+; RV32ZVBC32-NEXT:    vmul.vv v9, v8, v9
+; RV32ZVBC32-NEXT:    vxor.vv v1, v2, v1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v1, v9
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, t6
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, s0
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, s6
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, s7
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, s8
+; RV32ZVBC32-NEXT:    addi a3, sp, 112
+; RV32ZVBC32-NEXT:    vmul.vv v1, v8, v1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v1
+; RV32ZVBC32-NEXT:    vlse64.v v1, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 104
+; RV32ZVBC32-NEXT:    vand.vv v7, v13, v7
+; RV32ZVBC32-NEXT:    vmul.vv v7, v8, v7
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v7
+; RV32ZVBC32-NEXT:    vlse64.v v7, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 96
+; RV32ZVBC32-NEXT:    vand.vv v31, v13, v31
+; RV32ZVBC32-NEXT:    vmul.vv v31, v8, v31
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v31
+; RV32ZVBC32-NEXT:    vlse64.v v31, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 88
+; RV32ZVBC32-NEXT:    vand.vv v30, v13, v30
+; RV32ZVBC32-NEXT:    vmul.vv v30, v8, v30
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v30
+; RV32ZVBC32-NEXT:    vlse64.v v30, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 80
+; RV32ZVBC32-NEXT:    vand.vv v29, v13, v29
+; RV32ZVBC32-NEXT:    vmul.vv v29, v8, v29
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v29
+; RV32ZVBC32-NEXT:    vlse64.v v29, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 72
+; RV32ZVBC32-NEXT:    vand.vv v26, v13, v26
+; RV32ZVBC32-NEXT:    vmul.vv v26, v8, v26
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v26
+; RV32ZVBC32-NEXT:    vlse64.v v26, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 64
+; RV32ZVBC32-NEXT:    vand.vv v23, v13, v23
+; RV32ZVBC32-NEXT:    vmul.vv v23, v8, v23
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v23
+; RV32ZVBC32-NEXT:    vlse64.v v23, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 56
+; RV32ZVBC32-NEXT:    vand.vv v20, v13, v20
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v20
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 48
+; RV32ZVBC32-NEXT:    vand.vv v17, v13, v17
+; RV32ZVBC32-NEXT:    vmul.vv v17, v8, v17
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v17
+; RV32ZVBC32-NEXT:    vlse64.v v17, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 40
+; RV32ZVBC32-NEXT:    vand.vv v15, v13, v15
+; RV32ZVBC32-NEXT:    vmul.vv v15, v8, v15
+; RV32ZVBC32-NEXT:    vxor.vv v15, v9, v15
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 32
+; RV32ZVBC32-NEXT:    addi a4, sp, 304
+; RV32ZVBC32-NEXT:    vl1r.v v9, (a4) # vscale x 8-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v9, v13, v9
+; RV32ZVBC32-NEXT:    vand.vv v18, v13, v18
+; RV32ZVBC32-NEXT:    vmul.vv v9, v8, v9
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v9, v15, v9
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v18
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a3), zero
+; RV32ZVBC32-NEXT:    vand.vv v16, v13, v16
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v16
+; RV32ZVBC32-NEXT:    vand.vv v16, v13, v19
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v16
+; RV32ZVBC32-NEXT:    vand.vv v16, v13, v21
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v16
+; RV32ZVBC32-NEXT:    vand.vv v16, v13, v22
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v16
+; RV32ZVBC32-NEXT:    vand.vv v16, v13, v24
+; RV32ZVBC32-NEXT:    vand.vv v19, v13, v28
+; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vmul.vv v19, v8, v19
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v16
+; RV32ZVBC32-NEXT:    vand.vx v16, v15, a1
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v19
+; RV32ZVBC32-NEXT:    vsrl.vi v19, v9, 24
+; RV32ZVBC32-NEXT:    vand.vx v19, v19, a1
+; RV32ZVBC32-NEXT:    vand.vv v21, v13, v25
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v21
+; RV32ZVBC32-NEXT:    vand.vv v21, v13, v27
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v21
+; RV32ZVBC32-NEXT:    vand.vv v21, v13, v6
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v21
+; RV32ZVBC32-NEXT:    vand.vv v21, v13, v5
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v21
+; RV32ZVBC32-NEXT:    vand.vv v21, v13, v4
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v21
+; RV32ZVBC32-NEXT:    vand.vv v21, v13, v1
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v21
+; RV32ZVBC32-NEXT:    vand.vv v21, v13, v7
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v21
+; RV32ZVBC32-NEXT:    vand.vv v21, v13, v31
+; RV32ZVBC32-NEXT:    vmul.vv v21, v8, v21
+; RV32ZVBC32-NEXT:    vand.vx v22, v2, a0
+; RV32ZVBC32-NEXT:    vsll.vx v22, v22, a2
+; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v21
+; RV32ZVBC32-NEXT:    vsrl.vx v21, v9, a2
+; RV32ZVBC32-NEXT:    vand.vx v21, v21, a0
+; RV32ZVBC32-NEXT:    vand.vv v24, v13, v30
+; RV32ZVBC32-NEXT:    vand.vv v25, v13, v29
+; RV32ZVBC32-NEXT:    vand.vv v26, v13, v26
+; RV32ZVBC32-NEXT:    vand.vv v23, v13, v23
+; RV32ZVBC32-NEXT:    vand.vv v20, v13, v20
+; RV32ZVBC32-NEXT:    vand.vv v17, v13, v17
+; RV32ZVBC32-NEXT:    vand.vv v27, v13, v0
+; RV32ZVBC32-NEXT:    vand.vv v13, v13, v18
+; RV32ZVBC32-NEXT:    vmul.vv v18, v8, v24
+; RV32ZVBC32-NEXT:    vmul.vv v24, v8, v25
+; RV32ZVBC32-NEXT:    vmul.vv v25, v8, v26
+; RV32ZVBC32-NEXT:    vmul.vv v23, v8, v23
+; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC32-NEXT:    vmul.vv v17, v8, v17
+; RV32ZVBC32-NEXT:    vmul.vv v26, v8, v27
+; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v13
+; RV32ZVBC32-NEXT:    vxor.vv v13, v9, v18
+; RV32ZVBC32-NEXT:    vxor.vv v13, v13, v24
+; RV32ZVBC32-NEXT:    vxor.vv v13, v13, v25
+; RV32ZVBC32-NEXT:    vxor.vv v13, v13, v23
+; RV32ZVBC32-NEXT:    vxor.vv v13, v13, v20
+; RV32ZVBC32-NEXT:    vxor.vv v13, v13, v17
+; RV32ZVBC32-NEXT:    vxor.vv v13, v13, v26
+; RV32ZVBC32-NEXT:    vxor.vv v8, v13, v8
+; RV32ZVBC32-NEXT:    vsll.vx v13, v3, ra
+; RV32ZVBC32-NEXT:    vsrl.vx v8, v8, ra
+; RV32ZVBC32-NEXT:    vor.vv v13, v13, v22
+; RV32ZVBC32-NEXT:    vsrl.vi v15, v15, 8
+; RV32ZVBC32-NEXT:    vsll.vi v16, v16, 24
+; RV32ZVBC32-NEXT:    vand.vv v15, v15, v12
+; RV32ZVBC32-NEXT:    vor.vv v15, v15, v19
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vsll.vi v9, v9, 8
+; RV32ZVBC32-NEXT:    vor.vv v9, v16, v9
+; RV32ZVBC32-NEXT:    vor.vv v9, v13, v9
+; RV32ZVBC32-NEXT:    vor.vv v8, v21, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v15, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v9, v8, 4
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v14
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v9, v8, 2
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v10
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v9, v8, 1
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v11
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 364(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 360(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 356(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 352(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 368
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv1i64_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    li a1, 56
+; RV64ZVBC32-NEXT:    lui t2, 16
+; RV64ZVBC32-NEXT:    lui a2, 4080
+; RV64ZVBC32-NEXT:    li t0, 255
+; RV64ZVBC32-NEXT:    lui a3, 61681
+; RV64ZVBC32-NEXT:    lui a4, 209715
+; RV64ZVBC32-NEXT:    lui a5, 349525
+; RV64ZVBC32-NEXT:    srli a6, a0, 24
+; RV64ZVBC32-NEXT:    srli a7, a0, 8
+; RV64ZVBC32-NEXT:    srli t1, a0, 40
+; RV64ZVBC32-NEXT:    srli t3, a0, 56
+; RV64ZVBC32-NEXT:    addi a3, a3, -241
+; RV64ZVBC32-NEXT:    addi a4, a4, 819
+; RV64ZVBC32-NEXT:    addi t4, a5, 1365
+; RV64ZVBC32-NEXT:    slli a5, a3, 32
+; RV64ZVBC32-NEXT:    add a5, a3, a5
+; RV64ZVBC32-NEXT:    slli a3, a4, 32
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, t4, 32
+; RV64ZVBC32-NEXT:    add a3, t4, a3
+; RV64ZVBC32-NEXT:    srliw t4, a0, 24
+; RV64ZVBC32-NEXT:    slli t0, t0, 24
+; RV64ZVBC32-NEXT:    and a6, a6, a2
+; RV64ZVBC32-NEXT:    and a7, a7, t0
+; RV64ZVBC32-NEXT:    or t5, a7, a6
+; RV64ZVBC32-NEXT:    addi a6, t2, -256
+; RV64ZVBC32-NEXT:    and a7, t1, a6
+; RV64ZVBC32-NEXT:    or t1, a7, t3
+; RV64ZVBC32-NEXT:    and a7, a0, a2
+; RV64ZVBC32-NEXT:    slli t4, t4, 32
+; RV64ZVBC32-NEXT:    slli a7, a7, 24
+; RV64ZVBC32-NEXT:    or t3, a7, t4
+; RV64ZVBC32-NEXT:    li a7, 40
+; RV64ZVBC32-NEXT:    vsetvli t4, zero, e64, m1, ta, ma
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 24
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 8
+; RV64ZVBC32-NEXT:    or t1, t5, t1
+; RV64ZVBC32-NEXT:    slli t4, a0, 56
+; RV64ZVBC32-NEXT:    and a0, a0, a6
+; RV64ZVBC32-NEXT:    slli a0, a0, 40
+; RV64ZVBC32-NEXT:    or t4, t4, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    or t4, t4, t3
+; RV64ZVBC32-NEXT:    lui t3, 1
+; RV64ZVBC32-NEXT:    vsrl.vx v11, v8, a1
+; RV64ZVBC32-NEXT:    vsrl.vx v12, v8, a7
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a2
+; RV64ZVBC32-NEXT:    vand.vx v13, v8, a2
+; RV64ZVBC32-NEXT:    vsll.vx v14, v8, a1
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a6
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, t0
+; RV64ZVBC32-NEXT:    vsll.vi v13, v13, 24
+; RV64ZVBC32-NEXT:    vor.vv v11, v12, v11
+; RV64ZVBC32-NEXT:    vand.vx v12, v8, t0
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a6
+; RV64ZVBC32-NEXT:    vor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vsll.vi v10, v12, 8
+; RV64ZVBC32-NEXT:    vsll.vx v8, v8, a7
+; RV64ZVBC32-NEXT:    vor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vor.vv v10, v13, v10
+; RV64ZVBC32-NEXT:    vor.vv v8, v14, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v10
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 4
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a5
+; RV64ZVBC32-NEXT:    srli t4, t1, 4
+; RV64ZVBC32-NEXT:    and t1, t1, a5
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a5
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    and t4, t4, a5
+; RV64ZVBC32-NEXT:    slli t1, t1, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a4
+; RV64ZVBC32-NEXT:    srli t4, t1, 2
+; RV64ZVBC32-NEXT:    and t1, t1, a4
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a4
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    and t4, t4, a4
+; RV64ZVBC32-NEXT:    slli t1, t1, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 1
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a3
+; RV64ZVBC32-NEXT:    srli t4, t1, 1
+; RV64ZVBC32-NEXT:    and t1, t1, a3
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a3
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    and t4, t4, a3
+; RV64ZVBC32-NEXT:    slli t1, t1, 1
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    andi t4, t1, 2
+; RV64ZVBC32-NEXT:    vmul.vx v9, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 1
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 4
+; RV64ZVBC32-NEXT:    vmul.vx v11, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 8
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 16
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 32
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 64
+; RV64ZVBC32-NEXT:    vxor.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 128
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vmul.vx v11, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 256
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 512
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 1024
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 11
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 2
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vmul.vx v11, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 4
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v9, v12
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 8
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 32
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v10, v12, v10
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 64
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vx v11, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 128
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 256
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 512
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 1024
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vx v11, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 2048
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 4096
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 8192
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 16384
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vx v11, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 32768
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 65536
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 131072
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 262144
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 32
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vsll.vx v9, v9, a1
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a6
+; RV64ZVBC32-NEXT:    vsll.vx v10, v10, a7
+; RV64ZVBC32-NEXT:    vor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 33
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 34
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 35
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 36
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vmul.vx v11, v8, t4
+; RV64ZVBC32-NEXT:    srliw t4, t1, 31
+; RV64ZVBC32-NEXT:    slli t4, t4, 31
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 37
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 38
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 39
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 40
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vx v11, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 41
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 42
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 43
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 44
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vx v11, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 45
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 46
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v10, v12
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 47
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v11, v12, v11
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 48
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 49
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 50
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 51
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 52
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 53
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v12, v11, v12
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 54
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    vsrl.vi v13, v10, 8
+; RV64ZVBC32-NEXT:    vand.vx v13, v13, t0
+; RV64ZVBC32-NEXT:    vsrl.vi v11, v11, 24
+; RV64ZVBC32-NEXT:    vand.vx v11, v11, a2
+; RV64ZVBC32-NEXT:    vor.vv v11, v13, v11
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 55
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 56
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 57
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 58
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 59
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 60
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a2
+; RV64ZVBC32-NEXT:    slli a2, a0, 61
+; RV64ZVBC32-NEXT:    slli a0, a0, 62
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    and a2, t1, a2
+; RV64ZVBC32-NEXT:    and a0, t1, a0
+; RV64ZVBC32-NEXT:    srli t1, t1, 63
+; RV64ZVBC32-NEXT:    vsll.vi v10, v10, 24
+; RV64ZVBC32-NEXT:    vxor.vv v13, v12, v13
+; RV64ZVBC32-NEXT:    vxor.vv v13, v13, v14
+; RV64ZVBC32-NEXT:    vand.vx v14, v12, t0
+; RV64ZVBC32-NEXT:    vsll.vi v14, v14, 8
+; RV64ZVBC32-NEXT:    vor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t4
+; RV64ZVBC32-NEXT:    vxor.vv v13, v13, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t2
+; RV64ZVBC32-NEXT:    vor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, t3
+; RV64ZVBC32-NEXT:    vxor.vv v13, v13, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, a2
+; RV64ZVBC32-NEXT:    vxor.vv v10, v13, v10
+; RV64ZVBC32-NEXT:    vmul.vx v13, v8, a0
+; RV64ZVBC32-NEXT:    slli t1, t1, 63
+; RV64ZVBC32-NEXT:    vmul.vx v8, v8, t1
+; RV64ZVBC32-NEXT:    vsrl.vx v12, v12, a7
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a6
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v13
+; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsrl.vx v8, v8, a1
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v11, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 4
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a5
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a5
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a4
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a4
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 1
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a3
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a3
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i128 0
   %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
   %va.ext = zext <vscale x 1 x i64> %va to <vscale x 1 x i128>
@@ -19444,17 +26499,5102 @@ define <vscale x 2 x i64> @clmulh_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV64V-NEXT:    addi sp, sp, 320
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv2i64_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v10
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv2i64_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vclmulh.vv v8, v8, v10
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv2i64_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vclmulh.vv v8, v8, v10
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv2i64_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -352
+; RV32ZVBC32-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    sub sp, sp, a0
+; RV32ZVBC32-NEXT:    lui s8, 1044480
+; RV32ZVBC32-NEXT:    lui t1, 524288
+; RV32ZVBC32-NEXT:    li a1, 1
+; RV32ZVBC32-NEXT:    li s11, 2
+; RV32ZVBC32-NEXT:    li ra, 4
+; RV32ZVBC32-NEXT:    li a7, 8
+; RV32ZVBC32-NEXT:    li t0, 16
+; RV32ZVBC32-NEXT:    li a6, 32
+; RV32ZVBC32-NEXT:    li a5, 64
+; RV32ZVBC32-NEXT:    li s10, 128
+; RV32ZVBC32-NEXT:    li a4, 256
+; RV32ZVBC32-NEXT:    li a3, 512
+; RV32ZVBC32-NEXT:    li a2, 1024
+; RV32ZVBC32-NEXT:    lui t3, 1
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    lui s9, 4
+; RV32ZVBC32-NEXT:    lui t2, 8
+; RV32ZVBC32-NEXT:    lui t4, 16
+; RV32ZVBC32-NEXT:    lui t5, 32
+; RV32ZVBC32-NEXT:    lui t6, 64
+; RV32ZVBC32-NEXT:    lui s0, 128
+; RV32ZVBC32-NEXT:    lui s1, 256
+; RV32ZVBC32-NEXT:    lui s2, 512
+; RV32ZVBC32-NEXT:    lui s3, 1024
+; RV32ZVBC32-NEXT:    lui s4, 2048
+; RV32ZVBC32-NEXT:    lui s5, 4096
+; RV32ZVBC32-NEXT:    lui s6, 8192
+; RV32ZVBC32-NEXT:    lui s7, 16384
+; RV32ZVBC32-NEXT:    sw s8, 248(sp)
+; RV32ZVBC32-NEXT:    lui s8, 32768
+; RV32ZVBC32-NEXT:    sw zero, 252(sp)
+; RV32ZVBC32-NEXT:    sw t1, 8(sp)
+; RV32ZVBC32-NEXT:    sw zero, 12(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw a1, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw s11, 260(sp)
+; RV32ZVBC32-NEXT:    lui s11, 65536
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw ra, 268(sp)
+; RV32ZVBC32-NEXT:    lui ra, 131072
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    sw a7, 244(sp)
+; RV32ZVBC32-NEXT:    lui a7, 262144
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    sw t0, 236(sp)
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw a6, 228(sp)
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw a5, 220(sp)
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s10, 212(sp)
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw a4, 204(sp)
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw a3, 196(sp)
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw a2, 188(sp)
+; RV32ZVBC32-NEXT:    slli a5, a1, 11
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw a5, 180(sp)
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw t3, 172(sp)
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw a0, 164(sp)
+; RV32ZVBC32-NEXT:    lui a6, 2
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw s9, 156(sp)
+; RV32ZVBC32-NEXT:    lui t0, 4
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw t2, 148(sp)
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw t4, 140(sp)
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw t5, 132(sp)
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw t6, 124(sp)
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw s0, 116(sp)
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw s1, 108(sp)
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw s2, 100(sp)
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw s3, 92(sp)
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw s4, 84(sp)
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw s5, 76(sp)
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw s6, 68(sp)
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw s7, 60(sp)
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw s8, 52(sp)
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw s11, 44(sp)
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    sw ra, 36(sp)
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    sw a7, 28(sp)
+; RV32ZVBC32-NEXT:    sw zero, 16(sp)
+; RV32ZVBC32-NEXT:    sw t1, 20(sp)
+; RV32ZVBC32-NEXT:    lui a0, 61681
+; RV32ZVBC32-NEXT:    addi a0, a0, -241
+; RV32ZVBC32-NEXT:    vsetvli t3, zero, e32, m2, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v2, a0
+; RV32ZVBC32-NEXT:    lui a0, 209715
+; RV32ZVBC32-NEXT:    addi a0, a0, 819
+; RV32ZVBC32-NEXT:    vmv.v.x v0, a0
+; RV32ZVBC32-NEXT:    lui a0, 349525
+; RV32ZVBC32-NEXT:    addi a0, a0, 1365
+; RV32ZVBC32-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 248
+; RV32ZVBC32-NEXT:    vsetvli t3, zero, e64, m2, ta, ma
+; RV32ZVBC32-NEXT:    vlse64.v v4, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 8
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 272
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 256
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a0), zero
+; RV32ZVBC32-NEXT:    li t3, 56
+; RV32ZVBC32-NEXT:    vsrl.vi v18, v8, 24
+; RV32ZVBC32-NEXT:    vsrl.vi v20, v8, 8
+; RV32ZVBC32-NEXT:    vsrl.vx v22, v8, t3
+; RV32ZVBC32-NEXT:    li s9, 40
+; RV32ZVBC32-NEXT:    vsrl.vx v24, v8, s9
+; RV32ZVBC32-NEXT:    vsll.vx v26, v8, t3
+; RV32ZVBC32-NEXT:    vsrl.vx v28, v10, t3
+; RV32ZVBC32-NEXT:    vsrl.vx v30, v10, s9
+; RV32ZVBC32-NEXT:    addi s10, t4, -256
+; RV32ZVBC32-NEXT:    vand.vx v24, v24, s10
+; RV32ZVBC32-NEXT:    vor.vv v22, v24, v22
+; RV32ZVBC32-NEXT:    vsll.vx v6, v10, t3
+; RV32ZVBC32-NEXT:    vand.vx v24, v30, s10
+; RV32ZVBC32-NEXT:    vor.vv v30, v24, v28
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s10
+; RV32ZVBC32-NEXT:    vsll.vx v24, v24, s9
+; RV32ZVBC32-NEXT:    vor.vv v24, v26, v24
+; RV32ZVBC32-NEXT:    vand.vx v26, v10, s10
+; RV32ZVBC32-NEXT:    vsll.vx v26, v26, s9
+; RV32ZVBC32-NEXT:    vor.vv v26, v6, v26
+; RV32ZVBC32-NEXT:    vsrl.vi v28, v10, 24
+; RV32ZVBC32-NEXT:    lui a4, 4080
+; RV32ZVBC32-NEXT:    vand.vx v18, v18, a4
+; RV32ZVBC32-NEXT:    vand.vv v20, v20, v4
+; RV32ZVBC32-NEXT:    vor.vv v20, v20, v18
+; RV32ZVBC32-NEXT:    vsrl.vi v18, v10, 8
+; RV32ZVBC32-NEXT:    vand.vx v28, v28, a4
+; RV32ZVBC32-NEXT:    vand.vv v18, v18, v4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v4, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vor.vv v6, v18, v28
+; RV32ZVBC32-NEXT:    addi a3, sp, 264
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v28, v20, v22
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a4
+; RV32ZVBC32-NEXT:    vsll.vi v20, v20, 24
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v4
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 8
+; RV32ZVBC32-NEXT:    vor.vv v8, v20, v8
+; RV32ZVBC32-NEXT:    addi a3, sp, 240
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v30, v6, v30
+; RV32ZVBC32-NEXT:    vand.vx v22, v10, a4
+; RV32ZVBC32-NEXT:    vsll.vi v22, v22, 24
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v4
+; RV32ZVBC32-NEXT:    vsll.vi v10, v10, 8
+; RV32ZVBC32-NEXT:    vor.vv v10, v22, v10
+; RV32ZVBC32-NEXT:    addi a3, sp, 232
+; RV32ZVBC32-NEXT:    vlse64.v v22, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    addi a3, sp, 224
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v10, v26, v10
+; RV32ZVBC32-NEXT:    addi a3, sp, 216
+; RV32ZVBC32-NEXT:    vlse64.v v26, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v28
+; RV32ZVBC32-NEXT:    addi a3, sp, 208
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v10, v10, v30
+; RV32ZVBC32-NEXT:    vsrl.vi v30, v8, 4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v2, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v2
+; RV32ZVBC32-NEXT:    vand.vv v30, v30, v2
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v30, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v30, v10, 4
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v2
+; RV32ZVBC32-NEXT:    vand.vv v30, v30, v2
+; RV32ZVBC32-NEXT:    vsll.vi v10, v10, 4
+; RV32ZVBC32-NEXT:    vor.vv v10, v30, v10
+; RV32ZVBC32-NEXT:    vsrl.vi v30, v8, 2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v0, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vv v30, v30, v0
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v30, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v30, v10, 2
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v0
+; RV32ZVBC32-NEXT:    vand.vv v30, v30, v0
+; RV32ZVBC32-NEXT:    vsll.vi v10, v10, 2
+; RV32ZVBC32-NEXT:    vor.vv v30, v30, v10
+; RV32ZVBC32-NEXT:    vsrl.vi v10, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v6, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v6
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v6
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v10, v10, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v30, 1
+; RV32ZVBC32-NEXT:    vand.vv v30, v30, v6
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v6
+; RV32ZVBC32-NEXT:    vadd.vv v30, v30, v30
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v30
+; RV32ZVBC32-NEXT:    addi a3, sp, 200
+; RV32ZVBC32-NEXT:    vlse64.v v30, (a3), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v22
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v26
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a3, sp, 192
+; RV32ZVBC32-NEXT:    addi a1, sp, 184
+; RV32ZVBC32-NEXT:    addi a0, sp, 176
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a3), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a0), zero
+; RV32ZVBC32-NEXT:    vand.vv v18, v8, v30
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v18, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a2, sp, 168
+; RV32ZVBC32-NEXT:    addi a1, sp, 160
+; RV32ZVBC32-NEXT:    addi a3, sp, 152
+; RV32ZVBC32-NEXT:    addi a0, sp, 144
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a3), zero
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 136
+; RV32ZVBC32-NEXT:    addi a1, sp, 128
+; RV32ZVBC32-NEXT:    addi a2, sp, 120
+; RV32ZVBC32-NEXT:    addi a3, sp, 112
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a3), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 104
+; RV32ZVBC32-NEXT:    addi a1, sp, 96
+; RV32ZVBC32-NEXT:    addi a2, sp, 88
+; RV32ZVBC32-NEXT:    addi a3, sp, 80
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a3), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 72
+; RV32ZVBC32-NEXT:    addi a1, sp, 64
+; RV32ZVBC32-NEXT:    addi a2, sp, 56
+; RV32ZVBC32-NEXT:    addi a3, sp, 48
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a3), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 40
+; RV32ZVBC32-NEXT:    addi a1, sp, 32
+; RV32ZVBC32-NEXT:    addi a2, sp, 24
+; RV32ZVBC32-NEXT:    addi a3, sp, 16
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a3), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vand.vx v18, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v22, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vand.vx v26, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vand.vx v28, v8, a0
+; RV32ZVBC32-NEXT:    vand.vx v30, v8, a5
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vand.vx v6, v8, a0
+; RV32ZVBC32-NEXT:    vand.vx v4, v8, a6
+; RV32ZVBC32-NEXT:    vand.vx v2, v8, t0
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, t2
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t5
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s3
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s5
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s7
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s11
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, ra
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a7
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vi v12, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v14, v8, 1
+; RV32ZVBC32-NEXT:    vand.vi v16, v8, 4
+; RV32ZVBC32-NEXT:    vand.vi v8, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v14
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v16
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v8
+; RV32ZVBC32-NEXT:    vmul.vv v18, v10, v18
+; RV32ZVBC32-NEXT:    vmul.vv v20, v10, v20
+; RV32ZVBC32-NEXT:    vmul.vv v22, v10, v22
+; RV32ZVBC32-NEXT:    vmul.vv v24, v10, v24
+; RV32ZVBC32-NEXT:    vmul.vv v26, v10, v26
+; RV32ZVBC32-NEXT:    vmul.vv v28, v10, v28
+; RV32ZVBC32-NEXT:    vmul.vv v30, v10, v30
+; RV32ZVBC32-NEXT:    vmul.vv v6, v10, v6
+; RV32ZVBC32-NEXT:    vmul.vv v4, v10, v4
+; RV32ZVBC32-NEXT:    vmul.vv v2, v10, v2
+; RV32ZVBC32-NEXT:    vmul.vv v0, v10, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v10, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v22
+; RV32ZVBC32-NEXT:    vxor.vv v12, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v26
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v28
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v30
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v6
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v4
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v2
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v22, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v22
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v22, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v22
+; RV32ZVBC32-NEXT:    vand.vx v22, v14, a4
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v16, 24
+; RV32ZVBC32-NEXT:    vand.vx v16, v16, a4
+; RV32ZVBC32-NEXT:    vand.vx v12, v12, s10
+; RV32ZVBC32-NEXT:    vsll.vx v12, v12, s9
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v24, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v24
+; RV32ZVBC32-NEXT:    vsrl.vx v24, v18, s9
+; RV32ZVBC32-NEXT:    vand.vx v24, v24, s10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v26
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v26
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v26
+; RV32ZVBC32-NEXT:    vxor.vv v10, v20, v10
+; RV32ZVBC32-NEXT:    vsll.vx v8, v8, t3
+; RV32ZVBC32-NEXT:    vsrl.vx v10, v10, t3
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vsrl.vi v12, v14, 8
+; RV32ZVBC32-NEXT:    vsll.vi v14, v22, 24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v12, v12, v20
+; RV32ZVBC32-NEXT:    vor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vand.vv v16, v18, v20
+; RV32ZVBC32-NEXT:    vsll.vi v16, v16, 8
+; RV32ZVBC32-NEXT:    vor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vor.vv v10, v24, v10
+; RV32ZVBC32-NEXT:    vor.vv v10, v12, v10
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v10
+; RV32ZVBC32-NEXT:    vsrl.vi v10, v8, 4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v10, v8, 2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v10, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 352
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv2i64_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vclmulh.vv v8, v8, v10
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmulh_nxv2i64_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    addi sp, sp, -320
+; RV64ZVBC32-NEXT:    sd ra, 312(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s0, 304(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s1, 296(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s2, 288(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s3, 280(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s4, 272(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s5, 264(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s6, 256(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s7, 248(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s8, 240(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s9, 232(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s10, 224(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s11, 216(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    sub sp, sp, a0
+; RV64ZVBC32-NEXT:    lui a1, 16
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 24
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v8, 8
+; RV64ZVBC32-NEXT:    li a4, 255
+; RV64ZVBC32-NEXT:    lui a0, 61681
+; RV64ZVBC32-NEXT:    lui a2, 209715
+; RV64ZVBC32-NEXT:    lui a3, 349525
+; RV64ZVBC32-NEXT:    vsrl.vi v18, v10, 24
+; RV64ZVBC32-NEXT:    vsrl.vi v14, v10, 8
+; RV64ZVBC32-NEXT:    li a5, 16
+; RV64ZVBC32-NEXT:    li a6, 32
+; RV64ZVBC32-NEXT:    li a7, 64
+; RV64ZVBC32-NEXT:    li t0, 128
+; RV64ZVBC32-NEXT:    li t2, 256
+; RV64ZVBC32-NEXT:    li t3, 512
+; RV64ZVBC32-NEXT:    li t4, 1024
+; RV64ZVBC32-NEXT:    li t1, 1
+; RV64ZVBC32-NEXT:    lui s4, 1
+; RV64ZVBC32-NEXT:    lui s3, 2
+; RV64ZVBC32-NEXT:    lui s5, 4
+; RV64ZVBC32-NEXT:    lui t5, 8
+; RV64ZVBC32-NEXT:    lui s0, 32
+; RV64ZVBC32-NEXT:    li s6, 56
+; RV64ZVBC32-NEXT:    vsrl.vx v30, v8, s6
+; RV64ZVBC32-NEXT:    li s7, 40
+; RV64ZVBC32-NEXT:    vsrl.vx v0, v8, s7
+; RV64ZVBC32-NEXT:    addi s9, a1, -256
+; RV64ZVBC32-NEXT:    lui s8, 4080
+; RV64ZVBC32-NEXT:    vand.vx v2, v16, s8
+; RV64ZVBC32-NEXT:    slli a4, a4, 24
+; RV64ZVBC32-NEXT:    vand.vx v4, v8, s8
+; RV64ZVBC32-NEXT:    vsll.vx v6, v8, s6
+; RV64ZVBC32-NEXT:    addi t6, a0, -241
+; RV64ZVBC32-NEXT:    addi s1, a2, 819
+; RV64ZVBC32-NEXT:    addi s2, a3, 1365
+; RV64ZVBC32-NEXT:    vsrl.vx v20, v10, s6
+; RV64ZVBC32-NEXT:    vsrl.vx v22, v10, s7
+; RV64ZVBC32-NEXT:    vand.vx v18, v18, s8
+; RV64ZVBC32-NEXT:    vand.vx v24, v10, s8
+; RV64ZVBC32-NEXT:    vsll.vx v16, v10, s6
+; RV64ZVBC32-NEXT:    slli s11, t1, 11
+; RV64ZVBC32-NEXT:    slli a0, t1, 31
+; RV64ZVBC32-NEXT:    sd a0, 152(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t1, 32
+; RV64ZVBC32-NEXT:    sd a0, 144(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t1, 33
+; RV64ZVBC32-NEXT:    sd a0, 136(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t1, 34
+; RV64ZVBC32-NEXT:    sd a0, 128(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t6, 32
+; RV64ZVBC32-NEXT:    add t6, t6, a0
+; RV64ZVBC32-NEXT:    slli a0, s1, 32
+; RV64ZVBC32-NEXT:    add s1, s1, a0
+; RV64ZVBC32-NEXT:    slli a0, s2, 32
+; RV64ZVBC32-NEXT:    add s2, s2, a0
+; RV64ZVBC32-NEXT:    slli a0, t1, 35
+; RV64ZVBC32-NEXT:    sd a0, 120(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v22, v22, s9
+; RV64ZVBC32-NEXT:    vand.vx v14, v14, a4
+; RV64ZVBC32-NEXT:    vsll.vi v24, v24, 24
+; RV64ZVBC32-NEXT:    vand.vx v26, v10, a4
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, s9
+; RV64ZVBC32-NEXT:    vor.vv v20, v22, v20
+; RV64ZVBC32-NEXT:    vor.vv v14, v14, v18
+; RV64ZVBC32-NEXT:    vsll.vi v18, v26, 8
+; RV64ZVBC32-NEXT:    li a0, 40
+; RV64ZVBC32-NEXT:    vsll.vx v10, v10, a0
+; RV64ZVBC32-NEXT:    vor.vv v14, v14, v20
+; RV64ZVBC32-NEXT:    vor.vv v18, v24, v18
+; RV64ZVBC32-NEXT:    vor.vv v10, v16, v10
+; RV64ZVBC32-NEXT:    vor.vv v10, v10, v18
+; RV64ZVBC32-NEXT:    vor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    vsrl.vi v14, v10, 4
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, t6
+; RV64ZVBC32-NEXT:    vand.vx v14, v14, t6
+; RV64ZVBC32-NEXT:    vsll.vi v10, v10, 4
+; RV64ZVBC32-NEXT:    vor.vv v10, v14, v10
+; RV64ZVBC32-NEXT:    vsrl.vi v14, v10, 2
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, s1
+; RV64ZVBC32-NEXT:    vand.vx v14, v14, s1
+; RV64ZVBC32-NEXT:    vsll.vi v10, v10, 2
+; RV64ZVBC32-NEXT:    vor.vv v10, v14, v10
+; RV64ZVBC32-NEXT:    vsrl.vi v14, v10, 1
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, s2
+; RV64ZVBC32-NEXT:    vand.vx v14, v14, s2
+; RV64ZVBC32-NEXT:    vadd.vv v10, v10, v10
+; RV64ZVBC32-NEXT:    vor.vv v14, v14, v10
+; RV64ZVBC32-NEXT:    vand.vx v20, v14, a5
+; RV64ZVBC32-NEXT:    slli a2, t1, 36
+; RV64ZVBC32-NEXT:    sd a2, 112(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v22, v14, a6
+; RV64ZVBC32-NEXT:    slli a2, t1, 37
+; RV64ZVBC32-NEXT:    sd a2, 104(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v24, v14, a7
+; RV64ZVBC32-NEXT:    slli a2, t1, 38
+; RV64ZVBC32-NEXT:    sd a2, 96(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v26, v14, t0
+; RV64ZVBC32-NEXT:    slli a2, t1, 39
+; RV64ZVBC32-NEXT:    sd a2, 88(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v28, v14, t2
+; RV64ZVBC32-NEXT:    slli a2, t1, 40
+; RV64ZVBC32-NEXT:    sd a2, 80(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, t3
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 7
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a2) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a2, t1, 41
+; RV64ZVBC32-NEXT:    sd a2, 72(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, t4
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a2) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a2, t1, 42
+; RV64ZVBC32-NEXT:    sd a2, 64(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, s11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a2) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a2, t1, 43
+; RV64ZVBC32-NEXT:    sd a2, 56(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s9, 160(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v0, s9
+; RV64ZVBC32-NEXT:    vor.vv v10, v10, v30
+; RV64ZVBC32-NEXT:    vand.vx v0, v14, s4
+; RV64ZVBC32-NEXT:    slli a2, t1, 44
+; RV64ZVBC32-NEXT:    sd a2, 48(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd a4, 168(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a4
+; RV64ZVBC32-NEXT:    vor.vv v12, v12, v2
+; RV64ZVBC32-NEXT:    vand.vx v2, v14, s3
+; RV64ZVBC32-NEXT:    slli a2, t1, 45
+; RV64ZVBC32-NEXT:    sd a2, 40(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vsll.vi v16, v4, 24
+; RV64ZVBC32-NEXT:    vor.vv v10, v12, v10
+; RV64ZVBC32-NEXT:    vand.vx v12, v8, a4
+; RV64ZVBC32-NEXT:    vsll.vi v12, v12, 8
+; RV64ZVBC32-NEXT:    vor.vv v12, v16, v12
+; RV64ZVBC32-NEXT:    vand.vx v16, v14, s5
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 208
+; RV64ZVBC32-NEXT:    vs2r.v v16, (a2) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a2, t1, 46
+; RV64ZVBC32-NEXT:    sd a2, 32(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, s9
+; RV64ZVBC32-NEXT:    vsll.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vor.vv v8, v6, v8
+; RV64ZVBC32-NEXT:    vand.vx v16, v14, t5
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a2, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a2
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v16, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t1, 47
+; RV64ZVBC32-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v12
+; RV64ZVBC32-NEXT:    vand.vx v12, v14, a1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t1, 48
+; RV64ZVBC32-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v10
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 4
+; RV64ZVBC32-NEXT:    sd t6, 176(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, t6
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, t6
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 2
+; RV64ZVBC32-NEXT:    sd s1, 184(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, s1
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, s1
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 1
+; RV64ZVBC32-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, s2
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, s2
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, s0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli ra, t1, 49
+; RV64ZVBC32-NEXT:    slli s11, t1, 50
+; RV64ZVBC32-NEXT:    slli s10, t1, 51
+; RV64ZVBC32-NEXT:    slli s9, t1, 52
+; RV64ZVBC32-NEXT:    slli s8, t1, 53
+; RV64ZVBC32-NEXT:    slli s7, t1, 54
+; RV64ZVBC32-NEXT:    slli s6, t1, 55
+; RV64ZVBC32-NEXT:    slli s5, t1, 56
+; RV64ZVBC32-NEXT:    slli s4, t1, 57
+; RV64ZVBC32-NEXT:    slli s3, t1, 58
+; RV64ZVBC32-NEXT:    slli s2, t1, 59
+; RV64ZVBC32-NEXT:    slli t6, t1, 60
+; RV64ZVBC32-NEXT:    slli s1, t1, 61
+; RV64ZVBC32-NEXT:    slli s0, t1, 62
+; RV64ZVBC32-NEXT:    li a0, -1
+; RV64ZVBC32-NEXT:    slli t5, a0, 63
+; RV64ZVBC32-NEXT:    lui a0, 64
+; RV64ZVBC32-NEXT:    lui a1, 128
+; RV64ZVBC32-NEXT:    lui a2, 256
+; RV64ZVBC32-NEXT:    lui a3, 512
+; RV64ZVBC32-NEXT:    lui a4, 1024
+; RV64ZVBC32-NEXT:    lui a5, 2048
+; RV64ZVBC32-NEXT:    lui a6, 4096
+; RV64ZVBC32-NEXT:    lui a7, 8192
+; RV64ZVBC32-NEXT:    lui t0, 16384
+; RV64ZVBC32-NEXT:    lui t1, 32768
+; RV64ZVBC32-NEXT:    lui t2, 65536
+; RV64ZVBC32-NEXT:    lui t3, 131072
+; RV64ZVBC32-NEXT:    lui t4, 262144
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    sd s6, 8(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    mv s6, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add s6, s6, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add s6, s6, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, s6
+; RV64ZVBC32-NEXT:    ld s6, 8(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a2
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a3
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a4
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a5
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a6
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a7
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, t0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, t1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, t2
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, t3
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, t4
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 152(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 144(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 136(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 128(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 88(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 80(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 72(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 64(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 56(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, ra
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, s11
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, s10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, s9
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, s8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, s7
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, s6
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, s5
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, s4
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, s3
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, s2
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v14, t6
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vi v10, v14, 2
+; RV64ZVBC32-NEXT:    vand.vi v12, v14, 1
+; RV64ZVBC32-NEXT:    vand.vi v16, v14, 4
+; RV64ZVBC32-NEXT:    vand.vi v18, v14, 8
+; RV64ZVBC32-NEXT:    vand.vx v30, v14, s1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v30, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v30, v14, s0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v30, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v14, v14, t5
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v14, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v16
+; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v18
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v20
+; RV64ZVBC32-NEXT:    vmul.vv v18, v8, v22
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v24
+; RV64ZVBC32-NEXT:    vmul.vv v22, v8, v26
+; RV64ZVBC32-NEXT:    vmul.vv v24, v8, v28
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 7
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v26, v8, v26
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v28, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v28, v8, v28
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v30, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v30, v8, v30
+; RV64ZVBC32-NEXT:    vmul.vv v6, v8, v0
+; RV64ZVBC32-NEXT:    vmul.vv v4, v8, v2
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v2, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v2, v8, v2
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v0, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    addi a0, sp, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 7
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v8, v10
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v16
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v18
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v20
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v22
+; RV64ZVBC32-NEXT:    vxor.vv v12, v10, v24
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v26
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v28
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v30
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v6
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 208
+; RV64ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v14, v12, v8
+; RV64ZVBC32-NEXT:    addi a0, sp, 208
+; RV64ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v8
+; RV64ZVBC32-NEXT:    li a0, 56
+; RV64ZVBC32-NEXT:    vsll.vx v10, v10, a0
+; RV64ZVBC32-NEXT:    ld a2, 160(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a2
+; RV64ZVBC32-NEXT:    li a4, 40
+; RV64ZVBC32-NEXT:    vsll.vx v12, v12, a4
+; RV64ZVBC32-NEXT:    vor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v12, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 4
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 5
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 4
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v14, v12, v14
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 4
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 4
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 5
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 6
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v14, v16
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV64ZVBC32-NEXT:    vsrl.vi v18, v12, 8
+; RV64ZVBC32-NEXT:    ld a3, 168(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v18, v18, a3
+; RV64ZVBC32-NEXT:    vsrl.vi v14, v14, 24
+; RV64ZVBC32-NEXT:    lui a1, 4080
+; RV64ZVBC32-NEXT:    vand.vx v14, v14, a1
+; RV64ZVBC32-NEXT:    vor.vv v14, v18, v14
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 3
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 208
+; RV64ZVBC32-NEXT:    vl2r.v v18, (a5) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 4
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 208
+; RV64ZVBC32-NEXT:    vl2r.v v18, (a5) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 2
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 208
+; RV64ZVBC32-NEXT:    vl2r.v v18, (a5) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 2
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 2
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 208
+; RV64ZVBC32-NEXT:    vl2r.v v18, (a5) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 2
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 2
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 208
+; RV64ZVBC32-NEXT:    vl2r.v v18, (a5) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a1
+; RV64ZVBC32-NEXT:    vsll.vi v12, v12, 24
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a5, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a5, a5, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a5
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v18, v16, v18
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    mv a5, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a5, a5, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a5
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v20, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, a3
+; RV64ZVBC32-NEXT:    vsll.vi v20, v20, 8
+; RV64ZVBC32-NEXT:    vor.vv v12, v12, v20
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v20, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV64ZVBC32-NEXT:    vor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a3
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v12, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v18, v12
+; RV64ZVBC32-NEXT:    vsrl.vx v16, v16, a4
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 7
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v18
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a2
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v18
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    mv a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a2
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v18
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a2
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 208
+; RV64ZVBC32-NEXT:    vl2r.v v8, (a1) # vscale x 16-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsrl.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v14, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 4
+; RV64ZVBC32-NEXT:    ld a0, 176(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a0
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 2
+; RV64ZVBC32-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a0
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 1
+; RV64ZVBC32-NEXT:    ld a0, 192(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a0
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add sp, sp, a0
+; RV64ZVBC32-NEXT:    ld ra, 312(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s0, 304(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s1, 296(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s2, 288(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s3, 280(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s4, 272(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s5, 264(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s6, 256(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s7, 248(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s8, 240(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s9, 232(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s10, 224(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s11, 216(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    addi sp, sp, 320
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 2 x i64> %va to <vscale x 2 x i128>
   %vb.ext = zext <vscale x 2 x i64> %vb to <vscale x 2 x i128>
   %clmul = call <vscale x 2 x i128> @llvm.clmul.nxv2i128(<vscale x 2 x i128> %va.ext, <vscale x 2 x i128> %vb.ext)
@@ -22346,23 +34486,2905 @@ define <vscale x 2 x i64> @clmulh_nxv2i64_vx(<vscale x 2 x i64> %va, i64 %b) nou
 ; RV64V-NEXT:    vsrl.vi v8, v8, 1
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv2i64_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    addi sp, sp, -16
-; RV32ZVBC-NEXT:    sw a0, 8(sp)
-; RV32ZVBC-NEXT:    sw a1, 12(sp)
-; RV32ZVBC-NEXT:    addi a0, sp, 8
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vlse64.v v10, (a0), zero
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v10
-; RV32ZVBC-NEXT:    addi sp, sp, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv2i64_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    addi sp, sp, -16
+; RV32ZVBC64-NEXT:    sw a0, 8(sp)
+; RV32ZVBC64-NEXT:    sw a1, 12(sp)
+; RV32ZVBC64-NEXT:    addi a0, sp, 8
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vlse64.v v10, (a0), zero
+; RV32ZVBC64-NEXT:    vclmulh.vv v8, v8, v10
+; RV32ZVBC64-NEXT:    addi sp, sp, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv2i64_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vclmulh.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv2i64_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -368
+; RV32ZVBC32-NEXT:    sw ra, 364(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 360(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 356(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 352(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a2, 2
+; RV32ZVBC32-NEXT:    mv a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 2
+; RV32ZVBC32-NEXT:    add a3, a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    add a3, a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    add a2, a2, a3
+; RV32ZVBC32-NEXT:    sub sp, sp, a2
+; RV32ZVBC32-NEXT:    sw a0, 16(sp)
+; RV32ZVBC32-NEXT:    sw a1, 20(sp)
+; RV32ZVBC32-NEXT:    addi s7, sp, 16
+; RV32ZVBC32-NEXT:    lui s9, 1044480
+; RV32ZVBC32-NEXT:    li s1, 1
+; RV32ZVBC32-NEXT:    li ra, 2
+; RV32ZVBC32-NEXT:    li s3, 4
+; RV32ZVBC32-NEXT:    li s5, 8
+; RV32ZVBC32-NEXT:    li s6, 32
+; RV32ZVBC32-NEXT:    li s11, 64
+; RV32ZVBC32-NEXT:    li s4, 128
+; RV32ZVBC32-NEXT:    li s2, 256
+; RV32ZVBC32-NEXT:    li s0, 512
+; RV32ZVBC32-NEXT:    li t6, 1024
+; RV32ZVBC32-NEXT:    lui t5, 1
+; RV32ZVBC32-NEXT:    lui s8, 2
+; RV32ZVBC32-NEXT:    lui t4, 4
+; RV32ZVBC32-NEXT:    lui t2, 8
+; RV32ZVBC32-NEXT:    lui t1, 16
+; RV32ZVBC32-NEXT:    lui t0, 32
+; RV32ZVBC32-NEXT:    lui a7, 64
+; RV32ZVBC32-NEXT:    lui a6, 128
+; RV32ZVBC32-NEXT:    lui a5, 256
+; RV32ZVBC32-NEXT:    lui t3, 512
+; RV32ZVBC32-NEXT:    lui a4, 1024
+; RV32ZVBC32-NEXT:    lui a3, 2048
+; RV32ZVBC32-NEXT:    lui a2, 4096
+; RV32ZVBC32-NEXT:    lui a1, 8192
+; RV32ZVBC32-NEXT:    vsetvli s10, zero, e64, m2, ta, ma
+; RV32ZVBC32-NEXT:    vlse64.v v18, (s7), zero
+; RV32ZVBC32-NEXT:    lui s10, 16384
+; RV32ZVBC32-NEXT:    sw s9, 264(sp)
+; RV32ZVBC32-NEXT:    lui s9, 32768
+; RV32ZVBC32-NEXT:    sw zero, 268(sp)
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    sw a0, 24(sp)
+; RV32ZVBC32-NEXT:    sw zero, 28(sp)
+; RV32ZVBC32-NEXT:    sw zero, 288(sp)
+; RV32ZVBC32-NEXT:    sw s1, 292(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw ra, 276(sp)
+; RV32ZVBC32-NEXT:    lui ra, 65536
+; RV32ZVBC32-NEXT:    sw zero, 280(sp)
+; RV32ZVBC32-NEXT:    sw s3, 284(sp)
+; RV32ZVBC32-NEXT:    lui s7, 131072
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw s5, 260(sp)
+; RV32ZVBC32-NEXT:    lui s3, 262144
+; RV32ZVBC32-NEXT:    sw zero, 248(sp)
+; RV32ZVBC32-NEXT:    li s5, 16
+; RV32ZVBC32-NEXT:    sw s5, 252(sp)
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    sw s6, 244(sp)
+; RV32ZVBC32-NEXT:    li s5, 32
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    sw s11, 236(sp)
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw s4, 228(sp)
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s2, 220(sp)
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s0, 212(sp)
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw t6, 204(sp)
+; RV32ZVBC32-NEXT:    slli s1, s1, 11
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw s1, 196(sp)
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw t5, 188(sp)
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw s8, 180(sp)
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw t4, 172(sp)
+; RV32ZVBC32-NEXT:    lui s4, 4
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw t2, 164(sp)
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw t1, 156(sp)
+; RV32ZVBC32-NEXT:    lui s0, 16
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw t0, 148(sp)
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw a7, 140(sp)
+; RV32ZVBC32-NEXT:    lui t5, 64
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw a6, 132(sp)
+; RV32ZVBC32-NEXT:    lui s2, 128
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw a5, 124(sp)
+; RV32ZVBC32-NEXT:    lui t6, 256
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw t3, 116(sp)
+; RV32ZVBC32-NEXT:    lui a7, 512
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw a4, 108(sp)
+; RV32ZVBC32-NEXT:    lui t4, 1024
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw a3, 100(sp)
+; RV32ZVBC32-NEXT:    lui a5, 2048
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw a2, 92(sp)
+; RV32ZVBC32-NEXT:    lui a6, 4096
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw a1, 84(sp)
+; RV32ZVBC32-NEXT:    lui a4, 8192
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw s10, 76(sp)
+; RV32ZVBC32-NEXT:    lui t1, 16384
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw s9, 68(sp)
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw ra, 60(sp)
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw s7, 52(sp)
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw s3, 44(sp)
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    sw a0, 36(sp)
+; RV32ZVBC32-NEXT:    lui t3, 61681
+; RV32ZVBC32-NEXT:    addi t3, t3, -241
+; RV32ZVBC32-NEXT:    vsetvli s8, zero, e32, m2, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v4, t3
+; RV32ZVBC32-NEXT:    lui t3, 209715
+; RV32ZVBC32-NEXT:    addi t3, t3, 819
+; RV32ZVBC32-NEXT:    vmv.v.x v2, t3
+; RV32ZVBC32-NEXT:    lui t3, 349525
+; RV32ZVBC32-NEXT:    addi t3, t3, 1365
+; RV32ZVBC32-NEXT:    vmv.v.x v0, t3
+; RV32ZVBC32-NEXT:    addi t3, sp, 264
+; RV32ZVBC32-NEXT:    vsetvli s8, zero, e64, m2, ta, ma
+; RV32ZVBC32-NEXT:    vlse64.v v6, (t3), zero
+; RV32ZVBC32-NEXT:    addi t3, sp, 24
+; RV32ZVBC32-NEXT:    vlse64.v v12, (t3), zero
+; RV32ZVBC32-NEXT:    addi t3, sp, 288
+; RV32ZVBC32-NEXT:    vlse64.v v14, (t3), zero
+; RV32ZVBC32-NEXT:    addi t3, sp, 272
+; RV32ZVBC32-NEXT:    vlse64.v v16, (t3), zero
+; RV32ZVBC32-NEXT:    li t3, 56
+; RV32ZVBC32-NEXT:    vsrl.vi v10, v8, 24
+; RV32ZVBC32-NEXT:    vsrl.vi v20, v8, 8
+; RV32ZVBC32-NEXT:    vsrl.vx v22, v8, t3
+; RV32ZVBC32-NEXT:    li s8, 40
+; RV32ZVBC32-NEXT:    vsrl.vx v24, v8, s8
+; RV32ZVBC32-NEXT:    lui s11, 4080
+; RV32ZVBC32-NEXT:    vand.vx v28, v10, s11
+; RV32ZVBC32-NEXT:    vsll.vx v10, v8, t3
+; RV32ZVBC32-NEXT:    addi s10, s0, -256
+; RV32ZVBC32-NEXT:    vand.vx v24, v24, s10
+; RV32ZVBC32-NEXT:    vand.vx v26, v8, s10
+; RV32ZVBC32-NEXT:    vor.vv v22, v24, v22
+; RV32ZVBC32-NEXT:    vsll.vx v24, v26, s8
+; RV32ZVBC32-NEXT:    vor.vv v10, v10, v24
+; RV32ZVBC32-NEXT:    vsrl.vx v24, v18, t3
+; RV32ZVBC32-NEXT:    vsrl.vx v26, v18, s8
+; RV32ZVBC32-NEXT:    vsll.vx v30, v18, t3
+; RV32ZVBC32-NEXT:    vand.vx v26, v26, s10
+; RV32ZVBC32-NEXT:    vor.vv v24, v26, v24
+; RV32ZVBC32-NEXT:    vand.vx v26, v18, s10
+; RV32ZVBC32-NEXT:    vsll.vx v26, v26, s8
+; RV32ZVBC32-NEXT:    vor.vv v26, v30, v26
+; RV32ZVBC32-NEXT:    vsrl.vi v30, v18, 24
+; RV32ZVBC32-NEXT:    vand.vv v20, v20, v6
+; RV32ZVBC32-NEXT:    vor.vv v28, v20, v28
+; RV32ZVBC32-NEXT:    vsrl.vi v20, v18, 8
+; RV32ZVBC32-NEXT:    vand.vx v30, v30, s11
+; RV32ZVBC32-NEXT:    vand.vv v20, v20, v6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v6, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vor.vv v30, v20, v30
+; RV32ZVBC32-NEXT:    addi a3, sp, 280
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v28, v28, v22
+; RV32ZVBC32-NEXT:    vand.vx v22, v8, s11
+; RV32ZVBC32-NEXT:    vsll.vi v22, v22, 24
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v6
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 8
+; RV32ZVBC32-NEXT:    vor.vv v8, v22, v8
+; RV32ZVBC32-NEXT:    addi a3, sp, 256
+; RV32ZVBC32-NEXT:    vlse64.v v22, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v30, v30, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v18, s11
+; RV32ZVBC32-NEXT:    vsll.vi v24, v24, 24
+; RV32ZVBC32-NEXT:    vand.vv v18, v18, v6
+; RV32ZVBC32-NEXT:    vsll.vi v18, v18, 8
+; RV32ZVBC32-NEXT:    vor.vv v6, v24, v18
+; RV32ZVBC32-NEXT:    addi a3, sp, 248
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    addi a3, sp, 240
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v10, v26, v6
+; RV32ZVBC32-NEXT:    addi a3, sp, 232
+; RV32ZVBC32-NEXT:    vlse64.v v26, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v28
+; RV32ZVBC32-NEXT:    addi a3, sp, 224
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a3), zero
+; RV32ZVBC32-NEXT:    vor.vv v10, v10, v30
+; RV32ZVBC32-NEXT:    vsrl.vi v30, v8, 4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v4, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v4
+; RV32ZVBC32-NEXT:    vand.vv v30, v30, v4
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v30, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v30, v10, 4
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v4
+; RV32ZVBC32-NEXT:    vand.vv v30, v30, v4
+; RV32ZVBC32-NEXT:    vsll.vi v10, v10, 4
+; RV32ZVBC32-NEXT:    vor.vv v10, v30, v10
+; RV32ZVBC32-NEXT:    vsrl.vi v30, v8, 2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v2, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v2
+; RV32ZVBC32-NEXT:    vand.vv v30, v30, v2
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v30, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v30, v10, 2
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v2
+; RV32ZVBC32-NEXT:    vand.vv v30, v30, v2
+; RV32ZVBC32-NEXT:    vsll.vi v10, v10, 2
+; RV32ZVBC32-NEXT:    vor.vv v30, v30, v10
+; RV32ZVBC32-NEXT:    vsrl.vi v10, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v0, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v0
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v10, v10, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v30, 1
+; RV32ZVBC32-NEXT:    vand.vv v30, v30, v0
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vadd.vv v30, v30, v30
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v30
+; RV32ZVBC32-NEXT:    addi a3, sp, 216
+; RV32ZVBC32-NEXT:    vlse64.v v30, (a3), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v22
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v26
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a3, sp, 208
+; RV32ZVBC32-NEXT:    addi a1, sp, 200
+; RV32ZVBC32-NEXT:    addi a0, sp, 192
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a3), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a0), zero
+; RV32ZVBC32-NEXT:    vand.vv v18, v8, v30
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v18, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a2, sp, 184
+; RV32ZVBC32-NEXT:    addi a1, sp, 176
+; RV32ZVBC32-NEXT:    addi a3, sp, 168
+; RV32ZVBC32-NEXT:    addi a0, sp, 160
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a3), zero
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 152
+; RV32ZVBC32-NEXT:    addi a1, sp, 144
+; RV32ZVBC32-NEXT:    addi a2, sp, 136
+; RV32ZVBC32-NEXT:    addi a3, sp, 128
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a3), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 120
+; RV32ZVBC32-NEXT:    addi a1, sp, 112
+; RV32ZVBC32-NEXT:    addi a2, sp, 104
+; RV32ZVBC32-NEXT:    addi a3, sp, 96
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a3), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 88
+; RV32ZVBC32-NEXT:    addi a1, sp, 80
+; RV32ZVBC32-NEXT:    addi a2, sp, 72
+; RV32ZVBC32-NEXT:    addi a3, sp, 64
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a3), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 56
+; RV32ZVBC32-NEXT:    addi a1, sp, 48
+; RV32ZVBC32-NEXT:    addi a2, sp, 40
+; RV32ZVBC32-NEXT:    addi a3, sp, 32
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a2), zero
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a3), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v18, v8, s5
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v22, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vand.vx v26, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vand.vx v28, v8, a0
+; RV32ZVBC32-NEXT:    vand.vx v30, v8, s1
+; RV32ZVBC32-NEXT:    lui a0, 1
+; RV32ZVBC32-NEXT:    vand.vx v6, v8, a0
+; RV32ZVBC32-NEXT:    lui a0, 2
+; RV32ZVBC32-NEXT:    vand.vx v4, v8, a0
+; RV32ZVBC32-NEXT:    vand.vx v2, v8, s4
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, t2
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t5
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a7
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a5
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s9
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, ra
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s7
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s3
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vi v12, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v14, v8, 1
+; RV32ZVBC32-NEXT:    vand.vi v16, v8, 4
+; RV32ZVBC32-NEXT:    vand.vi v8, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v14
+; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v16
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v8
+; RV32ZVBC32-NEXT:    vmul.vv v18, v10, v18
+; RV32ZVBC32-NEXT:    vmul.vv v20, v10, v20
+; RV32ZVBC32-NEXT:    vmul.vv v22, v10, v22
+; RV32ZVBC32-NEXT:    vmul.vv v24, v10, v24
+; RV32ZVBC32-NEXT:    vmul.vv v26, v10, v26
+; RV32ZVBC32-NEXT:    vmul.vv v28, v10, v28
+; RV32ZVBC32-NEXT:    vmul.vv v30, v10, v30
+; RV32ZVBC32-NEXT:    vmul.vv v6, v10, v6
+; RV32ZVBC32-NEXT:    vmul.vv v4, v10, v4
+; RV32ZVBC32-NEXT:    vmul.vv v2, v10, v2
+; RV32ZVBC32-NEXT:    vmul.vv v0, v10, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    addi a0, sp, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v10, v10, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v18
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v22
+; RV32ZVBC32-NEXT:    vxor.vv v12, v8, v24
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v26
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v28
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v30
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v6
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v4
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v2
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v12, v14
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    addi a0, sp, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v14, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v16, v18
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v18, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v22, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v22
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v22, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v22
+; RV32ZVBC32-NEXT:    vand.vx v22, v14, s11
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v16, 24
+; RV32ZVBC32-NEXT:    vand.vx v16, v16, s11
+; RV32ZVBC32-NEXT:    vand.vx v12, v12, s10
+; RV32ZVBC32-NEXT:    vsll.vx v12, v12, s8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v24, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v24
+; RV32ZVBC32-NEXT:    vsrl.vx v24, v18, s8
+; RV32ZVBC32-NEXT:    vand.vx v24, v24, s10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v26
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v26
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v26
+; RV32ZVBC32-NEXT:    vxor.vv v10, v20, v10
+; RV32ZVBC32-NEXT:    vsll.vx v8, v8, t3
+; RV32ZVBC32-NEXT:    vsrl.vx v10, v10, t3
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vsrl.vi v12, v14, 8
+; RV32ZVBC32-NEXT:    vsll.vi v14, v22, 24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v12, v12, v20
+; RV32ZVBC32-NEXT:    vor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vand.vv v16, v18, v20
+; RV32ZVBC32-NEXT:    vsll.vi v16, v16, 8
+; RV32ZVBC32-NEXT:    vor.vv v14, v14, v16
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v14
+; RV32ZVBC32-NEXT:    vor.vv v10, v24, v10
+; RV32ZVBC32-NEXT:    vor.vv v10, v12, v10
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v10
+; RV32ZVBC32-NEXT:    vsrl.vi v10, v8, 4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v10, v8, 2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v10, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 364(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 360(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 356(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 352(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 368
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv2i64_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vclmulh.vx v8, v8, a0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmulh_nxv2i64_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    li a1, 56
+; RV64ZVBC32-NEXT:    lui t2, 16
+; RV64ZVBC32-NEXT:    lui a2, 4080
+; RV64ZVBC32-NEXT:    li t0, 255
+; RV64ZVBC32-NEXT:    lui a3, 61681
+; RV64ZVBC32-NEXT:    lui a4, 209715
+; RV64ZVBC32-NEXT:    lui a5, 349525
+; RV64ZVBC32-NEXT:    srli a6, a0, 24
+; RV64ZVBC32-NEXT:    srli a7, a0, 8
+; RV64ZVBC32-NEXT:    srli t1, a0, 40
+; RV64ZVBC32-NEXT:    srli t3, a0, 56
+; RV64ZVBC32-NEXT:    addi a3, a3, -241
+; RV64ZVBC32-NEXT:    addi a4, a4, 819
+; RV64ZVBC32-NEXT:    addi t4, a5, 1365
+; RV64ZVBC32-NEXT:    slli a5, a3, 32
+; RV64ZVBC32-NEXT:    add a5, a3, a5
+; RV64ZVBC32-NEXT:    slli a3, a4, 32
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, t4, 32
+; RV64ZVBC32-NEXT:    add a3, t4, a3
+; RV64ZVBC32-NEXT:    srliw t4, a0, 24
+; RV64ZVBC32-NEXT:    slli t0, t0, 24
+; RV64ZVBC32-NEXT:    and a6, a6, a2
+; RV64ZVBC32-NEXT:    and a7, a7, t0
+; RV64ZVBC32-NEXT:    or t5, a7, a6
+; RV64ZVBC32-NEXT:    addi a6, t2, -256
+; RV64ZVBC32-NEXT:    and a7, t1, a6
+; RV64ZVBC32-NEXT:    or t1, a7, t3
+; RV64ZVBC32-NEXT:    and a7, a0, a2
+; RV64ZVBC32-NEXT:    slli t4, t4, 32
+; RV64ZVBC32-NEXT:    slli a7, a7, 24
+; RV64ZVBC32-NEXT:    or t3, a7, t4
+; RV64ZVBC32-NEXT:    li a7, 40
+; RV64ZVBC32-NEXT:    vsetvli t4, zero, e64, m2, ta, ma
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v8, 24
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 8
+; RV64ZVBC32-NEXT:    or t1, t5, t1
+; RV64ZVBC32-NEXT:    slli t4, a0, 56
+; RV64ZVBC32-NEXT:    and a0, a0, a6
+; RV64ZVBC32-NEXT:    slli a0, a0, 40
+; RV64ZVBC32-NEXT:    or t4, t4, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    or t4, t4, t3
+; RV64ZVBC32-NEXT:    lui t3, 1
+; RV64ZVBC32-NEXT:    vsrl.vx v14, v8, a1
+; RV64ZVBC32-NEXT:    vsrl.vx v16, v8, a7
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a2
+; RV64ZVBC32-NEXT:    vand.vx v18, v8, a2
+; RV64ZVBC32-NEXT:    vsll.vx v20, v8, a1
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a6
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, t0
+; RV64ZVBC32-NEXT:    vsll.vi v18, v18, 24
+; RV64ZVBC32-NEXT:    vor.vv v14, v16, v14
+; RV64ZVBC32-NEXT:    vand.vx v16, v8, t0
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a6
+; RV64ZVBC32-NEXT:    vor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vsll.vi v12, v16, 8
+; RV64ZVBC32-NEXT:    vsll.vx v8, v8, a7
+; RV64ZVBC32-NEXT:    vor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    vor.vv v12, v18, v12
+; RV64ZVBC32-NEXT:    vor.vv v8, v20, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v12
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v10
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 4
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a5
+; RV64ZVBC32-NEXT:    srli t4, t1, 4
+; RV64ZVBC32-NEXT:    and t1, t1, a5
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a5
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    and t4, t4, a5
+; RV64ZVBC32-NEXT:    slli t1, t1, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a4
+; RV64ZVBC32-NEXT:    srli t4, t1, 2
+; RV64ZVBC32-NEXT:    and t1, t1, a4
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a4
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    and t4, t4, a4
+; RV64ZVBC32-NEXT:    slli t1, t1, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 1
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a3
+; RV64ZVBC32-NEXT:    srli t4, t1, 1
+; RV64ZVBC32-NEXT:    and t1, t1, a3
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a3
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    and t4, t4, a3
+; RV64ZVBC32-NEXT:    slli t1, t1, 1
+; RV64ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    andi t4, t1, 2
+; RV64ZVBC32-NEXT:    vmul.vx v10, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 1
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 4
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 8
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 16
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 32
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 64
+; RV64ZVBC32-NEXT:    vxor.vv v10, v12, v10
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 128
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 256
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 512
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v18
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 1024
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 11
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 2
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 4
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v16, v10, v16
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 8
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 32
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v16, v12
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 64
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 128
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v18
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 256
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 512
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 1024
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 2048
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v18
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 4096
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 8192
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 16384
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vx v22, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 32768
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v18
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 65536
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v14, v12, v20
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 131072
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v20, v14, v16
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 262144
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v20, v22
+; RV64ZVBC32-NEXT:    vxor.vv v18, v16, v18
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 32
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vsll.vx v10, v10, a1
+; RV64ZVBC32-NEXT:    vand.vx v20, v20, a6
+; RV64ZVBC32-NEXT:    vsll.vx v20, v20, a7
+; RV64ZVBC32-NEXT:    vor.vv v10, v10, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 33
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v18, v12
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 34
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 35
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 36
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v20, v12, v20
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t4
+; RV64ZVBC32-NEXT:    srliw t4, t1, 31
+; RV64ZVBC32-NEXT:    slli t4, t4, 31
+; RV64ZVBC32-NEXT:    vxor.vv v18, v20, v18
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 37
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 38
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v14, v18, v14
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 39
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 40
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 41
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 42
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v18
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 43
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 44
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 45
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 46
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v18, v12, v18
+; RV64ZVBC32-NEXT:    vxor.vv v16, v18, v16
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 47
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v14, v16, v14
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 48
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 49
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v18
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 50
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 51
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 52
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v18
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 53
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v16, v14, v16
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 54
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV64ZVBC32-NEXT:    vsrl.vi v18, v12, 8
+; RV64ZVBC32-NEXT:    vand.vx v18, v18, t0
+; RV64ZVBC32-NEXT:    vsrl.vi v14, v14, 24
+; RV64ZVBC32-NEXT:    vand.vx v14, v14, a2
+; RV64ZVBC32-NEXT:    vor.vv v14, v18, v14
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 55
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 56
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 57
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 58
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 59
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 60
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a2
+; RV64ZVBC32-NEXT:    slli a2, a0, 61
+; RV64ZVBC32-NEXT:    slli a0, a0, 62
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    and a2, t1, a2
+; RV64ZVBC32-NEXT:    and a0, t1, a0
+; RV64ZVBC32-NEXT:    srli t1, t1, 63
+; RV64ZVBC32-NEXT:    vsll.vi v12, v12, 24
+; RV64ZVBC32-NEXT:    vxor.vv v18, v16, v18
+; RV64ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v16, t0
+; RV64ZVBC32-NEXT:    vsll.vi v20, v20, 8
+; RV64ZVBC32-NEXT:    vor.vv v12, v12, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t4
+; RV64ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t2
+; RV64ZVBC32-NEXT:    vor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t3
+; RV64ZVBC32-NEXT:    vxor.vv v18, v18, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, a2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v18, v12
+; RV64ZVBC32-NEXT:    vmul.vx v18, v8, a0
+; RV64ZVBC32-NEXT:    slli t1, t1, 63
+; RV64ZVBC32-NEXT:    vmul.vx v8, v8, t1
+; RV64ZVBC32-NEXT:    vsrl.vx v16, v16, a7
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a6
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v18
+; RV64ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsrl.vx v8, v8, a1
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v14, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 4
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a5
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a5
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a4
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a4
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 1
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a3
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a3
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i128 0
   %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
   %va.ext = zext <vscale x 2 x i64> %va to <vscale x 2 x i128>
@@ -28258,17 +43280,5901 @@ define <vscale x 4 x i64> @clmulh_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4
 ; RV64V-NEXT:    addi sp, sp, 384
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv4i64_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v12
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv4i64_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vclmulh.vv v8, v8, v12
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv4i64_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vclmulh.vv v8, v8, v12
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmulh_nxv4i64_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vclmulh.vv v8, v8, v12
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv4i64_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -352
+; RV32ZVBC32-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    sub sp, sp, a0
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC32-NEXT:    vmv4r.v v28, v12
+; RV32ZVBC32-NEXT:    lui s11, 1044480
+; RV32ZVBC32-NEXT:    lui t6, 524288
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    li ra, 2
+; RV32ZVBC32-NEXT:    li t4, 4
+; RV32ZVBC32-NEXT:    li t2, 8
+; RV32ZVBC32-NEXT:    li t5, 16
+; RV32ZVBC32-NEXT:    li t3, 32
+; RV32ZVBC32-NEXT:    li t1, 64
+; RV32ZVBC32-NEXT:    li t0, 128
+; RV32ZVBC32-NEXT:    li a7, 256
+; RV32ZVBC32-NEXT:    li a6, 512
+; RV32ZVBC32-NEXT:    li a3, 1024
+; RV32ZVBC32-NEXT:    lui a2, 1
+; RV32ZVBC32-NEXT:    lui a4, 2
+; RV32ZVBC32-NEXT:    lui a1, 4
+; RV32ZVBC32-NEXT:    lui a5, 8
+; RV32ZVBC32-NEXT:    lui s0, 16
+; RV32ZVBC32-NEXT:    lui s1, 32
+; RV32ZVBC32-NEXT:    lui s2, 64
+; RV32ZVBC32-NEXT:    lui s3, 128
+; RV32ZVBC32-NEXT:    lui s4, 256
+; RV32ZVBC32-NEXT:    lui s5, 512
+; RV32ZVBC32-NEXT:    lui s6, 1024
+; RV32ZVBC32-NEXT:    lui s7, 2048
+; RV32ZVBC32-NEXT:    lui s8, 4096
+; RV32ZVBC32-NEXT:    lui s9, 8192
+; RV32ZVBC32-NEXT:    lui s10, 16384
+; RV32ZVBC32-NEXT:    sw s11, 248(sp)
+; RV32ZVBC32-NEXT:    lui s11, 32768
+; RV32ZVBC32-NEXT:    sw zero, 252(sp)
+; RV32ZVBC32-NEXT:    sw t6, 8(sp)
+; RV32ZVBC32-NEXT:    sw zero, 12(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw a0, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw ra, 260(sp)
+; RV32ZVBC32-NEXT:    lui ra, 65536
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw t4, 268(sp)
+; RV32ZVBC32-NEXT:    lui t4, 131072
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    sw t2, 244(sp)
+; RV32ZVBC32-NEXT:    lui t2, 262144
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    sw t5, 236(sp)
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw t3, 228(sp)
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw t1, 220(sp)
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw t0, 212(sp)
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw a7, 204(sp)
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw a6, 196(sp)
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw a3, 188(sp)
+; RV32ZVBC32-NEXT:    slli a3, a0, 11
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw a3, 180(sp)
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw a2, 172(sp)
+; RV32ZVBC32-NEXT:    lui t1, 1
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw a4, 164(sp)
+; RV32ZVBC32-NEXT:    lui t3, 2
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw a1, 156(sp)
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw a5, 148(sp)
+; RV32ZVBC32-NEXT:    lui t5, 8
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw s0, 140(sp)
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw s1, 132(sp)
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw s2, 124(sp)
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw s3, 116(sp)
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw s4, 108(sp)
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw s5, 100(sp)
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw s6, 92(sp)
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw s7, 84(sp)
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw s8, 76(sp)
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw s9, 68(sp)
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw s10, 60(sp)
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw s11, 52(sp)
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw ra, 44(sp)
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    sw t4, 36(sp)
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    sw t2, 28(sp)
+; RV32ZVBC32-NEXT:    sw zero, 16(sp)
+; RV32ZVBC32-NEXT:    sw t6, 20(sp)
+; RV32ZVBC32-NEXT:    addi a1, sp, 248
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    li a6, 56
+; RV32ZVBC32-NEXT:    vsrl.vx v20, v8, a6
+; RV32ZVBC32-NEXT:    li a5, 40
+; RV32ZVBC32-NEXT:    vsrl.vx v24, v8, a5
+; RV32ZVBC32-NEXT:    vsll.vx v16, v8, a6
+; RV32ZVBC32-NEXT:    vsrl.vx v12, v28, a6
+; RV32ZVBC32-NEXT:    vsrl.vx v4, v28, a5
+; RV32ZVBC32-NEXT:    addi a2, s0, -256
+; RV32ZVBC32-NEXT:    vand.vx v24, v24, a2
+; RV32ZVBC32-NEXT:    vor.vv v24, v24, v20
+; RV32ZVBC32-NEXT:    vsll.vx v0, v28, a6
+; RV32ZVBC32-NEXT:    vand.vx v20, v4, a2
+; RV32ZVBC32-NEXT:    vor.vv v12, v20, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
+; RV32ZVBC32-NEXT:    vsll.vx v20, v20, a5
+; RV32ZVBC32-NEXT:    vor.vv v20, v16, v20
+; RV32ZVBC32-NEXT:    vand.vx v16, v28, a2
+; RV32ZVBC32-NEXT:    vsll.vx v16, v16, a5
+; RV32ZVBC32-NEXT:    vor.vv v12, v0, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vsrl.vi v4, v8, 24
+; RV32ZVBC32-NEXT:    lui a4, 4080
+; RV32ZVBC32-NEXT:    vand.vx v4, v4, a4
+; RV32ZVBC32-NEXT:    vsrl.vi v0, v8, 8
+; RV32ZVBC32-NEXT:    vmv4r.v v12, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v0, v0, v8
+; RV32ZVBC32-NEXT:    vor.vv v4, v0, v4
+; RV32ZVBC32-NEXT:    vsrl.vi v0, v28, 24
+; RV32ZVBC32-NEXT:    vand.vx v0, v0, a4
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v28, 8
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v8
+; RV32ZVBC32-NEXT:    vor.vv v16, v16, v0
+; RV32ZVBC32-NEXT:    vor.vv v24, v4, v24
+; RV32ZVBC32-NEXT:    vand.vx v4, v12, a4
+; RV32ZVBC32-NEXT:    vsll.vi v4, v4, 24
+; RV32ZVBC32-NEXT:    vand.vv v12, v12, v8
+; RV32ZVBC32-NEXT:    vsll.vi v12, v12, 8
+; RV32ZVBC32-NEXT:    vor.vv v12, v4, v12
+; RV32ZVBC32-NEXT:    lui a7, 61681
+; RV32ZVBC32-NEXT:    addi a7, a7, -241
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v4, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v16, v16, v4
+; RV32ZVBC32-NEXT:    vmv4r.v v4, v28
+; RV32ZVBC32-NEXT:    vand.vx v28, v28, a4
+; RV32ZVBC32-NEXT:    vsll.vi v28, v28, 24
+; RV32ZVBC32-NEXT:    vand.vv v4, v4, v8
+; RV32ZVBC32-NEXT:    vsll.vi v4, v4, 8
+; RV32ZVBC32-NEXT:    vor.vv v28, v28, v4
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v4, a7
+; RV32ZVBC32-NEXT:    lui a7, 209715
+; RV32ZVBC32-NEXT:    addi a7, a7, 819
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e64, m4, ta, ma
+; RV32ZVBC32-NEXT:    vor.vv v12, v20, v12
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v0, a7
+; RV32ZVBC32-NEXT:    lui a7, 349525
+; RV32ZVBC32-NEXT:    addi a7, a7, 1365
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e64, m4, ta, ma
+; RV32ZVBC32-NEXT:    vor.vv v28, v8, v28
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v8, a7
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vsetvli a7, zero, e64, m4, ta, ma
+; RV32ZVBC32-NEXT:    vor.vv v24, v12, v24
+; RV32ZVBC32-NEXT:    addi a7, sp, 8
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a7), zero
+; RV32ZVBC32-NEXT:    vor.vv v8, v28, v16
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v24, 4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v20, v24, v4
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v4
+; RV32ZVBC32-NEXT:    vsll.vi v20, v20, 4
+; RV32ZVBC32-NEXT:    vor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vsrl.vi v20, v8, 4
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v4
+; RV32ZVBC32-NEXT:    vand.vv v20, v20, v4
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v20, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v20, v16, 2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v0, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v0
+; RV32ZVBC32-NEXT:    vand.vv v20, v20, v0
+; RV32ZVBC32-NEXT:    vsll.vi v16, v16, 2
+; RV32ZVBC32-NEXT:    vor.vv v16, v20, v16
+; RV32ZVBC32-NEXT:    vsrl.vi v20, v8, 2
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vv v20, v20, v0
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v20, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v20, v16, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v28
+; RV32ZVBC32-NEXT:    vand.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    vadd.vv v16, v16, v16
+; RV32ZVBC32-NEXT:    vor.vv v24, v20, v16
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v28
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v28
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    addi a7, sp, 272
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a7), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a7, sp, 256
+; RV32ZVBC32-NEXT:    addi t0, sp, 264
+; RV32ZVBC32-NEXT:    addi a0, sp, 240
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (t0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a0), zero
+; RV32ZVBC32-NEXT:    vand.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a1, sp, 232
+; RV32ZVBC32-NEXT:    addi a7, sp, 224
+; RV32ZVBC32-NEXT:    addi t0, sp, 216
+; RV32ZVBC32-NEXT:    addi a0, sp, 208
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (t0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 200
+; RV32ZVBC32-NEXT:    addi a1, sp, 192
+; RV32ZVBC32-NEXT:    addi a7, sp, 184
+; RV32ZVBC32-NEXT:    addi t0, sp, 176
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 168
+; RV32ZVBC32-NEXT:    addi a1, sp, 160
+; RV32ZVBC32-NEXT:    addi a7, sp, 152
+; RV32ZVBC32-NEXT:    addi t0, sp, 144
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 136
+; RV32ZVBC32-NEXT:    addi a1, sp, 128
+; RV32ZVBC32-NEXT:    addi a7, sp, 120
+; RV32ZVBC32-NEXT:    addi t0, sp, 112
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 104
+; RV32ZVBC32-NEXT:    addi a1, sp, 96
+; RV32ZVBC32-NEXT:    addi a7, sp, 88
+; RV32ZVBC32-NEXT:    addi t0, sp, 80
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 72
+; RV32ZVBC32-NEXT:    addi a1, sp, 64
+; RV32ZVBC32-NEXT:    addi a7, sp, 56
+; RV32ZVBC32-NEXT:    addi t0, sp, 48
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 40
+; RV32ZVBC32-NEXT:    addi a1, sp, 32
+; RV32ZVBC32-NEXT:    addi a7, sp, 24
+; RV32ZVBC32-NEXT:    addi t0, sp, 16
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vand.vx v28, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vand.vx v4, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a3
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t3
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 4
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t5
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s3
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s5
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s7
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s9
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s11
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, ra
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t2
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vi v12, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v16, v8, 1
+; RV32ZVBC32-NEXT:    vand.vi v20, v8, 4
+; RV32ZVBC32-NEXT:    vand.vi v8, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v12, v24, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vmul.vv v12, v24, v16
+; RV32ZVBC32-NEXT:    vmul.vv v16, v24, v20
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v20, v24, v8
+; RV32ZVBC32-NEXT:    vmul.vv v28, v24, v28
+; RV32ZVBC32-NEXT:    vmul.vv v4, v24, v4
+; RV32ZVBC32-NEXT:    vmul.vv v0, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v28
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vsll.vx v8, v8, a6
+; RV32ZVBC32-NEXT:    vand.vx v12, v12, a2
+; RV32ZVBC32-NEXT:    vsll.vx v12, v12, a5
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v16, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    vsrl.vi v28, v12, 8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v4, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v28, v28, v4
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v16, 24
+; RV32ZVBC32-NEXT:    vand.vx v16, v16, a4
+; RV32ZVBC32-NEXT:    vor.vv v16, v28, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v28, v20, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v0, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v28, v28, v0
+; RV32ZVBC32-NEXT:    vand.vx v12, v12, a4
+; RV32ZVBC32-NEXT:    vsll.vi v12, v12, 24
+; RV32ZVBC32-NEXT:    vand.vv v4, v20, v4
+; RV32ZVBC32-NEXT:    vsll.vi v4, v4, 8
+; RV32ZVBC32-NEXT:    vor.vv v12, v12, v4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v4, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v28, v28, v4
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v28, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v28
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v24
+; RV32ZVBC32-NEXT:    vsrl.vx v20, v20, a5
+; RV32ZVBC32-NEXT:    vand.vx v20, v20, a2
+; RV32ZVBC32-NEXT:    vsrl.vx v12, v12, a6
+; RV32ZVBC32-NEXT:    vor.vv v12, v20, v12
+; RV32ZVBC32-NEXT:    vor.vv v12, v16, v12
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vsrl.vi v12, v8, 4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v12, v8, 2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v12, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 352
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv4i64_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    addi sp, sp, -384
+; RV64ZVBC32-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s1, 360(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s2, 352(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s3, 344(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s4, 336(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s5, 328(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s6, 320(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s7, 312(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s8, 304(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s9, 296(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s10, 288(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s11, 280(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    sub sp, sp, a0
+; RV64ZVBC32-NEXT:    lui a5, 16
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC32-NEXT:    vsrl.vi v24, v8, 24
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    li s3, 255
+; RV64ZVBC32-NEXT:    lui a0, 61681
+; RV64ZVBC32-NEXT:    lui a1, 209715
+; RV64ZVBC32-NEXT:    lui a2, 349525
+; RV64ZVBC32-NEXT:    vsrl.vi v28, v12, 24
+; RV64ZVBC32-NEXT:    li a3, 56
+; RV64ZVBC32-NEXT:    vsrl.vx v16, v8, a3
+; RV64ZVBC32-NEXT:    li a4, 40
+; RV64ZVBC32-NEXT:    vsrl.vx v20, v8, a4
+; RV64ZVBC32-NEXT:    addi t3, a5, -256
+; RV64ZVBC32-NEXT:    vsrl.vx v4, v12, a3
+; RV64ZVBC32-NEXT:    vand.vx v20, v20, t3
+; RV64ZVBC32-NEXT:    vor.vv v20, v20, v16
+; RV64ZVBC32-NEXT:    vsrl.vx v16, v12, a4
+; RV64ZVBC32-NEXT:    li t4, 40
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, t3
+; RV64ZVBC32-NEXT:    vor.vv v0, v16, v4
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v12, 8
+; RV64ZVBC32-NEXT:    li a3, 16
+; RV64ZVBC32-NEXT:    li a6, 32
+; RV64ZVBC32-NEXT:    li a5, 64
+; RV64ZVBC32-NEXT:    li a4, 128
+; RV64ZVBC32-NEXT:    li t0, 256
+; RV64ZVBC32-NEXT:    li a7, 512
+; RV64ZVBC32-NEXT:    li t2, 1
+; RV64ZVBC32-NEXT:    lui t1, 4080
+; RV64ZVBC32-NEXT:    vand.vx v4, v24, t1
+; RV64ZVBC32-NEXT:    slli s3, s3, 24
+; RV64ZVBC32-NEXT:    vand.vx v24, v28, t1
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, s3
+; RV64ZVBC32-NEXT:    vor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vand.vx v24, v12, t1
+; RV64ZVBC32-NEXT:    lui t5, 4080
+; RV64ZVBC32-NEXT:    vsll.vi v28, v24, 24
+; RV64ZVBC32-NEXT:    vor.vv v24, v16, v0
+; RV64ZVBC32-NEXT:    vand.vx v16, v12, s3
+; RV64ZVBC32-NEXT:    vsll.vi v16, v16, 8
+; RV64ZVBC32-NEXT:    vor.vv v16, v28, v16
+; RV64ZVBC32-NEXT:    li t1, 56
+; RV64ZVBC32-NEXT:    vsll.vx v28, v12, t1
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, t3
+; RV64ZVBC32-NEXT:    mv s0, t3
+; RV64ZVBC32-NEXT:    sd t3, 224(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vsll.vx v12, v12, t4
+; RV64ZVBC32-NEXT:    vor.vv v12, v28, v12
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, t5
+; RV64ZVBC32-NEXT:    vor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vsll.vx v28, v8, t1
+; RV64ZVBC32-NEXT:    addi t3, a0, -241
+; RV64ZVBC32-NEXT:    addi t5, a1, 819
+; RV64ZVBC32-NEXT:    addi t6, a2, 1365
+; RV64ZVBC32-NEXT:    slli a0, t2, 11
+; RV64ZVBC32-NEXT:    sd a0, 216(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 31
+; RV64ZVBC32-NEXT:    sd a0, 208(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 32
+; RV64ZVBC32-NEXT:    sd a0, 200(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 33
+; RV64ZVBC32-NEXT:    sd a0, 192(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 34
+; RV64ZVBC32-NEXT:    sd a0, 184(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 35
+; RV64ZVBC32-NEXT:    sd a0, 176(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 36
+; RV64ZVBC32-NEXT:    sd a0, 168(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 37
+; RV64ZVBC32-NEXT:    sd a0, 160(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 38
+; RV64ZVBC32-NEXT:    sd a0, 152(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 39
+; RV64ZVBC32-NEXT:    sd a0, 144(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 40
+; RV64ZVBC32-NEXT:    sd a0, 136(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t3, 32
+; RV64ZVBC32-NEXT:    add t3, t3, a0
+; RV64ZVBC32-NEXT:    slli a0, t5, 32
+; RV64ZVBC32-NEXT:    add t5, t5, a0
+; RV64ZVBC32-NEXT:    slli a0, t6, 32
+; RV64ZVBC32-NEXT:    add a0, t6, a0
+; RV64ZVBC32-NEXT:    slli a1, t2, 41
+; RV64ZVBC32-NEXT:    sd a1, 128(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vor.vv v12, v12, v24
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v12, 4
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, t3
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, t3
+; RV64ZVBC32-NEXT:    vsll.vi v12, v12, 4
+; RV64ZVBC32-NEXT:    vor.vv v12, v16, v12
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v12, 2
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, t5
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, t5
+; RV64ZVBC32-NEXT:    vsll.vi v12, v12, 2
+; RV64ZVBC32-NEXT:    vor.vv v12, v16, v12
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v12, 1
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a0
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC32-NEXT:    vadd.vv v12, v12, v12
+; RV64ZVBC32-NEXT:    vor.vv v24, v16, v12
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a3
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 4
+; RV64ZVBC32-NEXT:    mv a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a2
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a1) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a1, t2, 42
+; RV64ZVBC32-NEXT:    sd a1, 120(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    mv a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a2
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a1) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    sd s3, 232(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, s3
+; RV64ZVBC32-NEXT:    vor.vv v12, v12, v4
+; RV64ZVBC32-NEXT:    vand.vx v16, v24, a6
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 4
+; RV64ZVBC32-NEXT:    mv a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a2
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 272
+; RV64ZVBC32-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a1, t2, 43
+; RV64ZVBC32-NEXT:    sd a1, 112(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vsll.vi v16, v0, 24
+; RV64ZVBC32-NEXT:    vor.vv v12, v12, v20
+; RV64ZVBC32-NEXT:    vand.vx v20, v8, s3
+; RV64ZVBC32-NEXT:    vsll.vi v20, v20, 8
+; RV64ZVBC32-NEXT:    vor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vand.vx v0, v24, a5
+; RV64ZVBC32-NEXT:    slli a1, t2, 44
+; RV64ZVBC32-NEXT:    sd a1, 104(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, s0
+; RV64ZVBC32-NEXT:    vsll.vx v8, v8, t4
+; RV64ZVBC32-NEXT:    vor.vv v8, v28, v8
+; RV64ZVBC32-NEXT:    vand.vx v20, v24, a4
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    mv a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a2
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 272
+; RV64ZVBC32-NEXT:    vs4r.v v20, (a1) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a1, t2, 45
+; RV64ZVBC32-NEXT:    sd a1, 96(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    vand.vx v16, v24, t0
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    mv a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a2
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 272
+; RV64ZVBC32-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a1, t2, 46
+; RV64ZVBC32-NEXT:    sd a1, 88(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v12
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v8, 4
+; RV64ZVBC32-NEXT:    sd t3, 240(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, t3
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, t3
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v8, 2
+; RV64ZVBC32-NEXT:    sd t5, 248(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, t5
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, t5
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v8, 1
+; RV64ZVBC32-NEXT:    sd a0, 256(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a0
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a7
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 47
+; RV64ZVBC32-NEXT:    sd a0, 80(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 48
+; RV64ZVBC32-NEXT:    sd a0, 72(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 49
+; RV64ZVBC32-NEXT:    sd a0, 64(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 50
+; RV64ZVBC32-NEXT:    sd a0, 56(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 51
+; RV64ZVBC32-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 52
+; RV64ZVBC32-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 53
+; RV64ZVBC32-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 54
+; RV64ZVBC32-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli ra, t2, 55
+; RV64ZVBC32-NEXT:    slli s11, t2, 56
+; RV64ZVBC32-NEXT:    slli s10, t2, 57
+; RV64ZVBC32-NEXT:    slli s9, t2, 58
+; RV64ZVBC32-NEXT:    slli s8, t2, 59
+; RV64ZVBC32-NEXT:    slli s6, t2, 60
+; RV64ZVBC32-NEXT:    slli s7, t2, 61
+; RV64ZVBC32-NEXT:    slli s5, t2, 62
+; RV64ZVBC32-NEXT:    li a0, -1
+; RV64ZVBC32-NEXT:    slli s4, a0, 63
+; RV64ZVBC32-NEXT:    li a0, 1024
+; RV64ZVBC32-NEXT:    lui a1, 1
+; RV64ZVBC32-NEXT:    lui a2, 2
+; RV64ZVBC32-NEXT:    lui a3, 4
+; RV64ZVBC32-NEXT:    lui a4, 8
+; RV64ZVBC32-NEXT:    lui a5, 32
+; RV64ZVBC32-NEXT:    lui a6, 64
+; RV64ZVBC32-NEXT:    lui a7, 128
+; RV64ZVBC32-NEXT:    lui t0, 256
+; RV64ZVBC32-NEXT:    lui t1, 512
+; RV64ZVBC32-NEXT:    lui t2, 1024
+; RV64ZVBC32-NEXT:    lui t3, 2048
+; RV64ZVBC32-NEXT:    lui t4, 4096
+; RV64ZVBC32-NEXT:    lui t5, 8192
+; RV64ZVBC32-NEXT:    lui t6, 16384
+; RV64ZVBC32-NEXT:    lui s0, 32768
+; RV64ZVBC32-NEXT:    lui s1, 65536
+; RV64ZVBC32-NEXT:    lui s2, 131072
+; RV64ZVBC32-NEXT:    lui s3, 262144
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    sd s8, 8(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv s8, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add s8, s8, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, s8
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 216(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv s8, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, s8
+; RV64ZVBC32-NEXT:    ld s8, 8(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a2
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a3
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a4
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    lui a0, 16
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a5
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a6
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a7
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 8
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, t0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, t1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, t2
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, t3
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, t4
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, t5
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, t6
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, s0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, s1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, s2
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, s3
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 208(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 200(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 192(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 176(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 168(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 160(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 152(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 144(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 136(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 128(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 88(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 80(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 72(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 64(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 56(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 7
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, ra
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, s11
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, s10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, s9
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, s8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v12, v24, s6
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vi v12, v24, 2
+; RV64ZVBC32-NEXT:    vand.vi v16, v24, 1
+; RV64ZVBC32-NEXT:    vand.vi v20, v24, 4
+; RV64ZVBC32-NEXT:    vand.vi v28, v24, 8
+; RV64ZVBC32-NEXT:    vand.vx v4, v24, s7
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v4, v24, s5
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v24, v24, s4
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v24, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v20
+; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v28
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v24, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v24, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v28, v8, v12
+; RV64ZVBC32-NEXT:    vmul.vv v4, v8, v0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v0, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 8
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    addi a0, sp, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 8
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 7
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v12
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v8, v12
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v20
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v24
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v28
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v12, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v20, v16, v8
+; RV64ZVBC32-NEXT:    addi a0, sp, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v20, v20, v8
+; RV64ZVBC32-NEXT:    li a2, 56
+; RV64ZVBC32-NEXT:    vsll.vx v12, v12, a2
+; RV64ZVBC32-NEXT:    ld a1, 224(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC32-NEXT:    li a0, 40
+; RV64ZVBC32-NEXT:    vsll.vx v16, v16, a0
+; RV64ZVBC32-NEXT:    vor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 6
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v16, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v20, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 8
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 5
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v20, v16, v20
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v20, v20, v24
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v20, v20, v24
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 6
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v20, v20, v24
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v20, v20, v24
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v20, v20, v24
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v20, v20, v24
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v20, v20, v24
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v20, v24
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v28, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v28
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v28, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v28
+; RV64ZVBC32-NEXT:    vsrl.vi v28, v16, 8
+; RV64ZVBC32-NEXT:    ld a4, 232(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v28, v28, a4
+; RV64ZVBC32-NEXT:    vsrl.vi v20, v20, 24
+; RV64ZVBC32-NEXT:    lui a3, 4080
+; RV64ZVBC32-NEXT:    vand.vx v20, v20, a3
+; RV64ZVBC32-NEXT:    vor.vv v20, v28, v20
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 5
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 3
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 272
+; RV64ZVBC32-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v28
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 2
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 4
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 272
+; RV64ZVBC32-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v28
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 3
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 4
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 272
+; RV64ZVBC32-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v28
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 2
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 2
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 4
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 272
+; RV64ZVBC32-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v28
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 4
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 4
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 272
+; RV64ZVBC32-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v28
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 2
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 5
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 272
+; RV64ZVBC32-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v28, v24, v28
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 3
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 5
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 272
+; RV64ZVBC32-NEXT:    vl4r.v v4, (a5) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v28, v28, v4
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a3
+; RV64ZVBC32-NEXT:    vsll.vi v16, v16, 24
+; RV64ZVBC32-NEXT:    vand.vx v4, v24, a4
+; RV64ZVBC32-NEXT:    vsll.vi v4, v4, 8
+; RV64ZVBC32-NEXT:    vor.vv v16, v16, v4
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v4, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v28, v28, v4
+; RV64ZVBC32-NEXT:    vor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 272
+; RV64ZVBC32-NEXT:    vl4r.v v16, (a3) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v28, v16
+; RV64ZVBC32-NEXT:    vsrl.vx v24, v24, a0
+; RV64ZVBC32-NEXT:    vand.vx v24, v24, a1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v28
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v28
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v28
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 272
+; RV64ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vsrl.vx v8, v8, a2
+; RV64ZVBC32-NEXT:    vor.vv v8, v24, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v20, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v8, 4
+; RV64ZVBC32-NEXT:    ld a0, 240(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a0
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v8, 2
+; RV64ZVBC32-NEXT:    ld a0, 248(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a0
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v8, 1
+; RV64ZVBC32-NEXT:    ld a0, 256(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a0
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add sp, sp, a0
+; RV64ZVBC32-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s1, 360(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s2, 352(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s3, 344(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s4, 336(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s5, 328(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s6, 320(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s7, 312(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s8, 304(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s9, 296(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s10, 288(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s11, 280(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    addi sp, sp, 384
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 4 x i64> %va to <vscale x 4 x i128>
   %vb.ext = zext <vscale x 4 x i64> %vb to <vscale x 4 x i128>
   %clmul = call <vscale x 4 x i128> @llvm.clmul.nxv4i128(<vscale x 4 x i128> %va.ext, <vscale x 4 x i128> %vb.ext)
@@ -31664,23 +52570,3409 @@ define <vscale x 4 x i64> @clmulh_nxv4i64_vx(<vscale x 4 x i64> %va, i64 %b) nou
 ; RV64V-NEXT:    addi sp, sp, 16
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv4i64_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    addi sp, sp, -16
-; RV32ZVBC-NEXT:    sw a0, 8(sp)
-; RV32ZVBC-NEXT:    sw a1, 12(sp)
-; RV32ZVBC-NEXT:    addi a0, sp, 8
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vlse64.v v12, (a0), zero
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v12
-; RV32ZVBC-NEXT:    addi sp, sp, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv4i64_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    addi sp, sp, -16
+; RV32ZVBC64-NEXT:    sw a0, 8(sp)
+; RV32ZVBC64-NEXT:    sw a1, 12(sp)
+; RV32ZVBC64-NEXT:    addi a0, sp, 8
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC64-NEXT:    vclmulh.vv v8, v8, v12
+; RV32ZVBC64-NEXT:    addi sp, sp, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv4i64_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vclmulh.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv4i64_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -368
+; RV32ZVBC32-NEXT:    sw ra, 364(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 360(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 356(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 352(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a2, 2
+; RV32ZVBC32-NEXT:    mv a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    add a3, a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    add a3, a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 1
+; RV32ZVBC32-NEXT:    add a3, a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 3
+; RV32ZVBC32-NEXT:    add a2, a2, a3
+; RV32ZVBC32-NEXT:    sub sp, sp, a2
+; RV32ZVBC32-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV32ZVBC32-NEXT:    vmv4r.v v0, v8
+; RV32ZVBC32-NEXT:    sw a0, 16(sp)
+; RV32ZVBC32-NEXT:    sw a1, 20(sp)
+; RV32ZVBC32-NEXT:    addi s10, sp, 16
+; RV32ZVBC32-NEXT:    lui s11, 1044480
+; RV32ZVBC32-NEXT:    lui s0, 524288
+; RV32ZVBC32-NEXT:    li a0, 1
+; RV32ZVBC32-NEXT:    li ra, 2
+; RV32ZVBC32-NEXT:    li t5, 4
+; RV32ZVBC32-NEXT:    li t3, 8
+; RV32ZVBC32-NEXT:    li t6, 16
+; RV32ZVBC32-NEXT:    li t4, 32
+; RV32ZVBC32-NEXT:    li t2, 64
+; RV32ZVBC32-NEXT:    li t1, 128
+; RV32ZVBC32-NEXT:    li t0, 256
+; RV32ZVBC32-NEXT:    li a7, 512
+; RV32ZVBC32-NEXT:    li a3, 1024
+; RV32ZVBC32-NEXT:    lui a2, 1
+; RV32ZVBC32-NEXT:    lui a4, 2
+; RV32ZVBC32-NEXT:    lui a1, 4
+; RV32ZVBC32-NEXT:    lui a5, 8
+; RV32ZVBC32-NEXT:    lui s1, 16
+; RV32ZVBC32-NEXT:    lui a6, 32
+; RV32ZVBC32-NEXT:    lui s2, 64
+; RV32ZVBC32-NEXT:    lui s3, 128
+; RV32ZVBC32-NEXT:    lui s4, 256
+; RV32ZVBC32-NEXT:    lui s5, 512
+; RV32ZVBC32-NEXT:    lui s6, 1024
+; RV32ZVBC32-NEXT:    lui s7, 2048
+; RV32ZVBC32-NEXT:    lui s8, 4096
+; RV32ZVBC32-NEXT:    lui s9, 8192
+; RV32ZVBC32-NEXT:    vlse64.v v4, (s10), zero
+; RV32ZVBC32-NEXT:    lui s10, 16384
+; RV32ZVBC32-NEXT:    sw s11, 264(sp)
+; RV32ZVBC32-NEXT:    lui s11, 32768
+; RV32ZVBC32-NEXT:    sw zero, 268(sp)
+; RV32ZVBC32-NEXT:    sw s0, 24(sp)
+; RV32ZVBC32-NEXT:    sw zero, 28(sp)
+; RV32ZVBC32-NEXT:    sw zero, 288(sp)
+; RV32ZVBC32-NEXT:    sw a0, 292(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw ra, 276(sp)
+; RV32ZVBC32-NEXT:    lui ra, 65536
+; RV32ZVBC32-NEXT:    sw zero, 280(sp)
+; RV32ZVBC32-NEXT:    sw t5, 284(sp)
+; RV32ZVBC32-NEXT:    lui t5, 131072
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw t3, 260(sp)
+; RV32ZVBC32-NEXT:    lui t3, 262144
+; RV32ZVBC32-NEXT:    sw zero, 248(sp)
+; RV32ZVBC32-NEXT:    sw t6, 252(sp)
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    sw t4, 244(sp)
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    sw t2, 236(sp)
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw t1, 228(sp)
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw t0, 220(sp)
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw a7, 212(sp)
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw a3, 204(sp)
+; RV32ZVBC32-NEXT:    slli a3, a0, 11
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw a3, 196(sp)
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw a2, 188(sp)
+; RV32ZVBC32-NEXT:    lui t1, 1
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw a4, 180(sp)
+; RV32ZVBC32-NEXT:    lui t4, 2
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw a1, 172(sp)
+; RV32ZVBC32-NEXT:    lui t2, 4
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw a5, 164(sp)
+; RV32ZVBC32-NEXT:    lui t6, 8
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw s1, 156(sp)
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw a6, 148(sp)
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw s2, 140(sp)
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw s3, 132(sp)
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw s4, 124(sp)
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw s5, 116(sp)
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw s6, 108(sp)
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw s7, 100(sp)
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw s8, 92(sp)
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw s9, 84(sp)
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw s10, 76(sp)
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw s11, 68(sp)
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw ra, 60(sp)
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw t5, 52(sp)
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw t3, 44(sp)
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    sw s0, 36(sp)
+; RV32ZVBC32-NEXT:    addi a1, sp, 264
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a1), zero
+; RV32ZVBC32-NEXT:    li a6, 56
+; RV32ZVBC32-NEXT:    vsrl.vx v16, v8, a6
+; RV32ZVBC32-NEXT:    li a5, 40
+; RV32ZVBC32-NEXT:    vsrl.vx v20, v8, a5
+; RV32ZVBC32-NEXT:    vsll.vx v24, v8, a6
+; RV32ZVBC32-NEXT:    addi a2, s1, -256
+; RV32ZVBC32-NEXT:    vand.vx v20, v20, a2
+; RV32ZVBC32-NEXT:    vand.vx v28, v8, a2
+; RV32ZVBC32-NEXT:    vor.vv v8, v20, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vsll.vx v16, v28, a5
+; RV32ZVBC32-NEXT:    vor.vv v8, v24, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vmv.v.v v20, v4
+; RV32ZVBC32-NEXT:    vsrl.vx v24, v4, a6
+; RV32ZVBC32-NEXT:    vsrl.vx v28, v4, a5
+; RV32ZVBC32-NEXT:    vsll.vx v4, v4, a6
+; RV32ZVBC32-NEXT:    vand.vx v28, v28, a2
+; RV32ZVBC32-NEXT:    vor.vv v28, v28, v24
+; RV32ZVBC32-NEXT:    vand.vx v24, v20, a2
+; RV32ZVBC32-NEXT:    vsll.vx v24, v24, a5
+; RV32ZVBC32-NEXT:    vor.vv v8, v4, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vmv4r.v v8, v0
+; RV32ZVBC32-NEXT:    vsrl.vi v4, v0, 24
+; RV32ZVBC32-NEXT:    lui a4, 4080
+; RV32ZVBC32-NEXT:    vand.vx v4, v4, a4
+; RV32ZVBC32-NEXT:    vsrl.vi v0, v0, 8
+; RV32ZVBC32-NEXT:    vand.vv v0, v0, v12
+; RV32ZVBC32-NEXT:    vor.vv v4, v0, v4
+; RV32ZVBC32-NEXT:    vsrl.vi v0, v20, 24
+; RV32ZVBC32-NEXT:    vand.vx v0, v0, a4
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v20, 8
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v12
+; RV32ZVBC32-NEXT:    vor.vv v16, v16, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v24, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v24, v4, v24
+; RV32ZVBC32-NEXT:    vand.vx v4, v8, a4
+; RV32ZVBC32-NEXT:    vsll.vi v4, v4, 24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 8
+; RV32ZVBC32-NEXT:    vor.vv v4, v4, v8
+; RV32ZVBC32-NEXT:    lui a7, 61681
+; RV32ZVBC32-NEXT:    addi a7, a7, -241
+; RV32ZVBC32-NEXT:    vor.vv v8, v16, v28
+; RV32ZVBC32-NEXT:    vand.vx v16, v20, a4
+; RV32ZVBC32-NEXT:    vsll.vi v16, v16, 24
+; RV32ZVBC32-NEXT:    vand.vv v12, v20, v12
+; RV32ZVBC32-NEXT:    vsll.vi v12, v12, 8
+; RV32ZVBC32-NEXT:    vor.vv v12, v16, v12
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v0, a7
+; RV32ZVBC32-NEXT:    lui a7, 209715
+; RV32ZVBC32-NEXT:    addi a7, a7, 819
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e64, m4, ta, ma
+; RV32ZVBC32-NEXT:    vor.vv v16, v16, v4
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v4, a7
+; RV32ZVBC32-NEXT:    lui a7, 349525
+; RV32ZVBC32-NEXT:    addi a7, a7, 1365
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e64, m4, ta, ma
+; RV32ZVBC32-NEXT:    vor.vv v28, v20, v12
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v12, a7
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vsetvli a7, zero, e64, m4, ta, ma
+; RV32ZVBC32-NEXT:    vor.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    addi a7, sp, 24
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a7), zero
+; RV32ZVBC32-NEXT:    vor.vv v8, v28, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v24, 4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v0, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v20, v24, v0
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v0
+; RV32ZVBC32-NEXT:    vsll.vi v20, v20, 4
+; RV32ZVBC32-NEXT:    vor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vsrl.vi v20, v8, 4
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vv v20, v20, v0
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v20, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v20, v16, 2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v4
+; RV32ZVBC32-NEXT:    vand.vv v20, v20, v4
+; RV32ZVBC32-NEXT:    vsll.vi v16, v16, 2
+; RV32ZVBC32-NEXT:    vor.vv v16, v20, v16
+; RV32ZVBC32-NEXT:    vsrl.vi v20, v8, 2
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v4
+; RV32ZVBC32-NEXT:    vand.vv v20, v20, v4
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v20, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v20, v16, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v28
+; RV32ZVBC32-NEXT:    vand.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    vadd.vv v16, v16, v16
+; RV32ZVBC32-NEXT:    vor.vv v24, v20, v16
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v28
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v28
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    addi a7, sp, 288
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a7), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a7, sp, 272
+; RV32ZVBC32-NEXT:    addi t0, sp, 280
+; RV32ZVBC32-NEXT:    addi a0, sp, 256
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (t0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a0), zero
+; RV32ZVBC32-NEXT:    vand.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a1, sp, 248
+; RV32ZVBC32-NEXT:    addi a7, sp, 240
+; RV32ZVBC32-NEXT:    addi t0, sp, 232
+; RV32ZVBC32-NEXT:    addi a0, sp, 224
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (t0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 216
+; RV32ZVBC32-NEXT:    addi a1, sp, 208
+; RV32ZVBC32-NEXT:    addi a7, sp, 200
+; RV32ZVBC32-NEXT:    addi t0, sp, 192
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 184
+; RV32ZVBC32-NEXT:    addi a1, sp, 176
+; RV32ZVBC32-NEXT:    addi a7, sp, 168
+; RV32ZVBC32-NEXT:    addi t0, sp, 160
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 152
+; RV32ZVBC32-NEXT:    addi a1, sp, 144
+; RV32ZVBC32-NEXT:    addi a7, sp, 136
+; RV32ZVBC32-NEXT:    addi t0, sp, 128
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 120
+; RV32ZVBC32-NEXT:    addi a1, sp, 112
+; RV32ZVBC32-NEXT:    addi a7, sp, 104
+; RV32ZVBC32-NEXT:    addi t0, sp, 96
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 88
+; RV32ZVBC32-NEXT:    addi a1, sp, 80
+; RV32ZVBC32-NEXT:    addi a7, sp, 72
+; RV32ZVBC32-NEXT:    addi t0, sp, 64
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 56
+; RV32ZVBC32-NEXT:    addi a1, sp, 48
+; RV32ZVBC32-NEXT:    addi a7, sp, 40
+; RV32ZVBC32-NEXT:    addi t0, sp, 32
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a7), zero
+; RV32ZVBC32-NEXT:    vlse64.v v28, (t0), zero
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v12, v8, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    li a0, 16
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    li a0, 32
+; RV32ZVBC32-NEXT:    vand.vx v28, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 64
+; RV32ZVBC32-NEXT:    vand.vx v4, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC32-NEXT:    li a0, 256
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    li a0, 512
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    li a0, 1024
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a3
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 32
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s3
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s5
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s7
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s9
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, s11
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, ra
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t5
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v12, v8, t3
+; RV32ZVBC32-NEXT:    addi a0, sp, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vi v12, v8, 2
+; RV32ZVBC32-NEXT:    vand.vi v16, v8, 1
+; RV32ZVBC32-NEXT:    vand.vi v20, v8, 4
+; RV32ZVBC32-NEXT:    vand.vi v8, v8, 8
+; RV32ZVBC32-NEXT:    vmul.vv v12, v24, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    vmul.vv v12, v24, v16
+; RV32ZVBC32-NEXT:    vmul.vv v16, v24, v20
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v20, v24, v8
+; RV32ZVBC32-NEXT:    vmul.vv v28, v24, v28
+; RV32ZVBC32-NEXT:    vmul.vv v4, v24, v4
+; RV32ZVBC32-NEXT:    vmul.vv v0, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v28
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v4
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    vsll.vx v8, v8, a6
+; RV32ZVBC32-NEXT:    vand.vx v12, v12, a2
+; RV32ZVBC32-NEXT:    vsll.vx v12, v12, a5
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v16, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v12, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v16, v20
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    vsrl.vi v28, v12, 8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v4, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v28, v28, v4
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v16, 24
+; RV32ZVBC32-NEXT:    vand.vx v16, v16, a4
+; RV32ZVBC32-NEXT:    vor.vv v16, v28, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v28, v20, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v0, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v28, v28, v0
+; RV32ZVBC32-NEXT:    vand.vx v12, v12, a4
+; RV32ZVBC32-NEXT:    vsll.vi v12, v12, 24
+; RV32ZVBC32-NEXT:    vand.vv v4, v20, v4
+; RV32ZVBC32-NEXT:    vsll.vi v4, v4, 8
+; RV32ZVBC32-NEXT:    vor.vv v12, v12, v4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v4, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v28, v28, v4
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v28, v12
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v28
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v28
+; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v24
+; RV32ZVBC32-NEXT:    vsrl.vx v20, v20, a5
+; RV32ZVBC32-NEXT:    vand.vx v20, v20, a2
+; RV32ZVBC32-NEXT:    vsrl.vx v12, v12, a6
+; RV32ZVBC32-NEXT:    vor.vv v12, v20, v12
+; RV32ZVBC32-NEXT:    vor.vv v12, v16, v12
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    vsrl.vi v12, v8, 4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v12, v8, 2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v12, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    vand.vv v12, v12, v16
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 364(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 360(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 356(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 352(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 368
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv4i64_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vclmulh.vx v8, v8, a0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmulh_nxv4i64_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    addi sp, sp, -16
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    sub sp, sp, a1
+; RV64ZVBC32-NEXT:    li a1, 56
+; RV64ZVBC32-NEXT:    lui t2, 16
+; RV64ZVBC32-NEXT:    lui a2, 4080
+; RV64ZVBC32-NEXT:    li t0, 255
+; RV64ZVBC32-NEXT:    lui a3, 61681
+; RV64ZVBC32-NEXT:    lui a4, 209715
+; RV64ZVBC32-NEXT:    lui a5, 349525
+; RV64ZVBC32-NEXT:    srli a6, a0, 24
+; RV64ZVBC32-NEXT:    srli a7, a0, 8
+; RV64ZVBC32-NEXT:    srli t1, a0, 40
+; RV64ZVBC32-NEXT:    srli t3, a0, 56
+; RV64ZVBC32-NEXT:    addi a3, a3, -241
+; RV64ZVBC32-NEXT:    addi a4, a4, 819
+; RV64ZVBC32-NEXT:    addi t4, a5, 1365
+; RV64ZVBC32-NEXT:    slli a5, a3, 32
+; RV64ZVBC32-NEXT:    add a5, a3, a5
+; RV64ZVBC32-NEXT:    slli a3, a4, 32
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, t4, 32
+; RV64ZVBC32-NEXT:    add a3, t4, a3
+; RV64ZVBC32-NEXT:    srliw t4, a0, 24
+; RV64ZVBC32-NEXT:    slli t0, t0, 24
+; RV64ZVBC32-NEXT:    and a6, a6, a2
+; RV64ZVBC32-NEXT:    and a7, a7, t0
+; RV64ZVBC32-NEXT:    or t5, a7, a6
+; RV64ZVBC32-NEXT:    addi a6, t2, -256
+; RV64ZVBC32-NEXT:    and a7, t1, a6
+; RV64ZVBC32-NEXT:    or t1, a7, t3
+; RV64ZVBC32-NEXT:    and a7, a0, a2
+; RV64ZVBC32-NEXT:    slli t4, t4, 32
+; RV64ZVBC32-NEXT:    slli a7, a7, 24
+; RV64ZVBC32-NEXT:    or t3, a7, t4
+; RV64ZVBC32-NEXT:    li a7, 40
+; RV64ZVBC32-NEXT:    vsetvli t4, zero, e64, m4, ta, ma
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 24
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v8, 8
+; RV64ZVBC32-NEXT:    or t1, t5, t1
+; RV64ZVBC32-NEXT:    slli t4, a0, 56
+; RV64ZVBC32-NEXT:    and a0, a0, a6
+; RV64ZVBC32-NEXT:    slli a0, a0, 40
+; RV64ZVBC32-NEXT:    or t4, t4, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    or t4, t4, t3
+; RV64ZVBC32-NEXT:    lui t3, 1
+; RV64ZVBC32-NEXT:    vsrl.vx v20, v8, a1
+; RV64ZVBC32-NEXT:    vsrl.vx v24, v8, a7
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC32-NEXT:    vand.vx v28, v8, a2
+; RV64ZVBC32-NEXT:    vsll.vx v4, v8, a1
+; RV64ZVBC32-NEXT:    vand.vx v24, v24, a6
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, t0
+; RV64ZVBC32-NEXT:    vsll.vi v28, v28, 24
+; RV64ZVBC32-NEXT:    vor.vv v20, v24, v20
+; RV64ZVBC32-NEXT:    vand.vx v24, v8, t0
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a6
+; RV64ZVBC32-NEXT:    vor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vsll.vi v16, v24, 8
+; RV64ZVBC32-NEXT:    vsll.vx v8, v8, a7
+; RV64ZVBC32-NEXT:    vor.vv v12, v12, v20
+; RV64ZVBC32-NEXT:    vor.vv v16, v28, v16
+; RV64ZVBC32-NEXT:    vor.vv v8, v4, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v12
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v8, 4
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a5
+; RV64ZVBC32-NEXT:    srli t4, t1, 4
+; RV64ZVBC32-NEXT:    and t1, t1, a5
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a5
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    and t4, t4, a5
+; RV64ZVBC32-NEXT:    slli t1, t1, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v8, 2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a4
+; RV64ZVBC32-NEXT:    srli t4, t1, 2
+; RV64ZVBC32-NEXT:    and t1, t1, a4
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a4
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    and t4, t4, a4
+; RV64ZVBC32-NEXT:    slli t1, t1, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v8, 1
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a3
+; RV64ZVBC32-NEXT:    srli t4, t1, 1
+; RV64ZVBC32-NEXT:    and t1, t1, a3
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a3
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    and t4, t4, a3
+; RV64ZVBC32-NEXT:    slli t1, t1, 1
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    andi t4, t1, 2
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 1
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 4
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 8
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 16
+; RV64ZVBC32-NEXT:    vmul.vx v28, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 32
+; RV64ZVBC32-NEXT:    vmul.vx v4, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 64
+; RV64ZVBC32-NEXT:    vxor.vv v12, v16, v12
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 128
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 256
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 512
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v28
+; RV64ZVBC32-NEXT:    vmul.vx v28, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 1024
+; RV64ZVBC32-NEXT:    vxor.vv v4, v12, v4
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 11
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v4, v4, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 2
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v4, v4, v20
+; RV64ZVBC32-NEXT:    addi t5, sp, 16
+; RV64ZVBC32-NEXT:    vs4r.v v4, (t5) # vscale x 32-byte Folded Spill
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 4
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v24, v4, v24
+; RV64ZVBC32-NEXT:    vxor.vv v28, v24, v28
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 8
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v12, v28, v12
+; RV64ZVBC32-NEXT:    vmul.vx v28, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 32
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v4, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 64
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v20
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 128
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v16, v12, v24
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 256
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v20, v16, v28
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 512
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v24, v20, v4
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 1024
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v28, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 2048
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v28, v12
+; RV64ZVBC32-NEXT:    vmul.vx v28, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 4096
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v12, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 8192
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 16384
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 32768
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v28
+; RV64ZVBC32-NEXT:    vmul.vx v28, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 65536
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v12, v16, v12
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 131072
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 262144
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v24, v12, v24
+; RV64ZVBC32-NEXT:    vxor.vv v4, v24, v28
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 32
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vl4r.v v28, (t5) # vscale x 32-byte Folded Reload
+; RV64ZVBC32-NEXT:    vsll.vx v28, v28, a1
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a6
+; RV64ZVBC32-NEXT:    vsll.vx v12, v12, a7
+; RV64ZVBC32-NEXT:    vor.vv v12, v28, v12
+; RV64ZVBC32-NEXT:    vmul.vx v28, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 33
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v16, v4, v16
+; RV64ZVBC32-NEXT:    vmul.vx v4, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 34
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 35
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 36
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v28, v16, v28
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t4
+; RV64ZVBC32-NEXT:    srliw t4, t1, 31
+; RV64ZVBC32-NEXT:    slli t4, t4, 31
+; RV64ZVBC32-NEXT:    vxor.vv v28, v28, v4
+; RV64ZVBC32-NEXT:    vmul.vx v4, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 37
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v4, v28, v4
+; RV64ZVBC32-NEXT:    vmul.vx v28, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 38
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v4, v4, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 39
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v4, v4, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 40
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v4, v4, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 41
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v4, v4, v28
+; RV64ZVBC32-NEXT:    vmul.vx v28, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 42
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v4, v4, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 43
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v4, v4, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 44
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v16, v4, v16
+; RV64ZVBC32-NEXT:    vmul.vx v4, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 45
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v28
+; RV64ZVBC32-NEXT:    vmul.vx v28, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 46
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v20, v16, v20
+; RV64ZVBC32-NEXT:    vxor.vv v24, v20, v24
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 47
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v4, v24, v4
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 48
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v4, v4, v28
+; RV64ZVBC32-NEXT:    vmul.vx v28, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 49
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v4, v4, v20
+; RV64ZVBC32-NEXT:    vmul.vx v20, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 50
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v4, v4, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 51
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v28, v4, v28
+; RV64ZVBC32-NEXT:    vmul.vx v4, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 52
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v20, v28, v20
+; RV64ZVBC32-NEXT:    vmul.vx v28, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 53
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v24, v20, v24
+; RV64ZVBC32-NEXT:    vxor.vv v4, v24, v4
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 54
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v28, v4, v28
+; RV64ZVBC32-NEXT:    vsrl.vi v4, v16, 8
+; RV64ZVBC32-NEXT:    vand.vx v4, v4, t0
+; RV64ZVBC32-NEXT:    vsrl.vi v20, v20, 24
+; RV64ZVBC32-NEXT:    vand.vx v20, v20, a2
+; RV64ZVBC32-NEXT:    vor.vv v20, v4, v20
+; RV64ZVBC32-NEXT:    vmul.vx v4, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 55
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v24, v28, v24
+; RV64ZVBC32-NEXT:    vmul.vx v28, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 56
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v4
+; RV64ZVBC32-NEXT:    vmul.vx v4, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 57
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v28
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 58
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v4
+; RV64ZVBC32-NEXT:    vmul.vx v28, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 59
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v4, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 60
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC32-NEXT:    slli a2, a0, 61
+; RV64ZVBC32-NEXT:    slli a0, a0, 62
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    and a2, t1, a2
+; RV64ZVBC32-NEXT:    and a0, t1, a0
+; RV64ZVBC32-NEXT:    srli t1, t1, 63
+; RV64ZVBC32-NEXT:    vsll.vi v16, v16, 24
+; RV64ZVBC32-NEXT:    vxor.vv v28, v24, v28
+; RV64ZVBC32-NEXT:    vxor.vv v28, v28, v4
+; RV64ZVBC32-NEXT:    vand.vx v4, v24, t0
+; RV64ZVBC32-NEXT:    vsll.vi v4, v4, 8
+; RV64ZVBC32-NEXT:    vor.vv v16, v16, v4
+; RV64ZVBC32-NEXT:    vmul.vx v4, v8, t4
+; RV64ZVBC32-NEXT:    vxor.vv v28, v28, v4
+; RV64ZVBC32-NEXT:    vmul.vx v4, v8, t2
+; RV64ZVBC32-NEXT:    vor.vv v12, v12, v16
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t3
+; RV64ZVBC32-NEXT:    vxor.vv v28, v28, v4
+; RV64ZVBC32-NEXT:    vmul.vx v4, v8, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v28, v16
+; RV64ZVBC32-NEXT:    vmul.vx v28, v8, a0
+; RV64ZVBC32-NEXT:    slli t1, t1, 63
+; RV64ZVBC32-NEXT:    vmul.vx v8, v8, t1
+; RV64ZVBC32-NEXT:    vsrl.vx v24, v24, a7
+; RV64ZVBC32-NEXT:    vand.vx v24, v24, a6
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v4
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v28
+; RV64ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vsrl.vx v8, v8, a1
+; RV64ZVBC32-NEXT:    vor.vv v8, v24, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v20, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v8, 4
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a5
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a5
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v8, 2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a4
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a4
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v8, 1
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a3
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a3
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add sp, sp, a0
+; RV64ZVBC32-NEXT:    addi sp, sp, 16
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i128 0
   %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
   %va.ext = zext <vscale x 4 x i64> %va to <vscale x 4 x i128>
@@ -38954,17 +63246,7279 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    addi sp, sp, 416
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv8i64_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv8i64_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vclmulh.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv8i64_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vclmulh.vv v8, v8, v16
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmulh_nxv8i64_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vclmulh.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv8i64_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -352
+; RV32ZVBC32-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    sub sp, sp, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui t6, 16
+; RV32ZVBC32-NEXT:    li t5, 56
+; RV32ZVBC32-NEXT:    li t4, 40
+; RV32ZVBC32-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV32ZVBC32-NEXT:    vsrl.vx v24, v8, t5
+; RV32ZVBC32-NEXT:    vsrl.vx v0, v8, t4
+; RV32ZVBC32-NEXT:    addi t3, t6, -256
+; RV32ZVBC32-NEXT:    vand.vx v0, v0, t3
+; RV32ZVBC32-NEXT:    vor.vv v24, v0, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vsrl.vx v24, v16, t4
+; RV32ZVBC32-NEXT:    vand.vx v24, v24, t3
+; RV32ZVBC32-NEXT:    vsrl.vx v0, v16, t5
+; RV32ZVBC32-NEXT:    vor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, t3
+; RV32ZVBC32-NEXT:    vsll.vx v24, v24, t4
+; RV32ZVBC32-NEXT:    vsll.vx v0, v8, t5
+; RV32ZVBC32-NEXT:    vor.vv v8, v0, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v8, v16, t3
+; RV32ZVBC32-NEXT:    vsll.vx v8, v8, t4
+; RV32ZVBC32-NEXT:    vsll.vx v0, v16, t5
+; RV32ZVBC32-NEXT:    vor.vv v8, v0, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a5, 1044480
+; RV32ZVBC32-NEXT:    lui a4, 524288
+; RV32ZVBC32-NEXT:    li ra, 1
+; RV32ZVBC32-NEXT:    li a6, 2
+; RV32ZVBC32-NEXT:    li a7, 4
+; RV32ZVBC32-NEXT:    li s0, 8
+; RV32ZVBC32-NEXT:    li s11, 16
+; RV32ZVBC32-NEXT:    li s10, 32
+; RV32ZVBC32-NEXT:    li s9, 64
+; RV32ZVBC32-NEXT:    li s8, 128
+; RV32ZVBC32-NEXT:    li s7, 256
+; RV32ZVBC32-NEXT:    li s6, 512
+; RV32ZVBC32-NEXT:    li s5, 1024
+; RV32ZVBC32-NEXT:    lui s4, 1
+; RV32ZVBC32-NEXT:    lui s3, 2
+; RV32ZVBC32-NEXT:    lui s2, 4
+; RV32ZVBC32-NEXT:    lui s1, 8
+; RV32ZVBC32-NEXT:    lui a0, 32
+; RV32ZVBC32-NEXT:    lui a1, 64
+; RV32ZVBC32-NEXT:    lui a2, 128
+; RV32ZVBC32-NEXT:    lui a3, 256
+; RV32ZVBC32-NEXT:    lui t1, 512
+; RV32ZVBC32-NEXT:    lui t0, 1024
+; RV32ZVBC32-NEXT:    lui t2, 2048
+; RV32ZVBC32-NEXT:    sw a5, 248(sp)
+; RV32ZVBC32-NEXT:    lui a5, 4096
+; RV32ZVBC32-NEXT:    sw zero, 252(sp)
+; RV32ZVBC32-NEXT:    sw a4, 8(sp)
+; RV32ZVBC32-NEXT:    sw zero, 12(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw ra, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw a6, 260(sp)
+; RV32ZVBC32-NEXT:    lui a6, 8192
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw a7, 268(sp)
+; RV32ZVBC32-NEXT:    lui a7, 16384
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    sw s0, 244(sp)
+; RV32ZVBC32-NEXT:    lui s0, 32768
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    sw s11, 236(sp)
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw s10, 228(sp)
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s9, 220(sp)
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s8, 212(sp)
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s7, 204(sp)
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw s6, 196(sp)
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw s5, 188(sp)
+; RV32ZVBC32-NEXT:    slli ra, ra, 11
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw ra, 180(sp)
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw s4, 172(sp)
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw s3, 164(sp)
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw s2, 156(sp)
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw s1, 148(sp)
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw t6, 140(sp)
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw a0, 132(sp)
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw a1, 124(sp)
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw a2, 116(sp)
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw a3, 108(sp)
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw t1, 100(sp)
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw t0, 92(sp)
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw t2, 84(sp)
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw a5, 76(sp)
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw a6, 68(sp)
+; RV32ZVBC32-NEXT:    lui a2, 8192
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw a7, 60(sp)
+; RV32ZVBC32-NEXT:    lui t2, 16384
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw s0, 52(sp)
+; RV32ZVBC32-NEXT:    lui a7, 65536
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw a7, 44(sp)
+; RV32ZVBC32-NEXT:    lui a6, 131072
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    sw a6, 36(sp)
+; RV32ZVBC32-NEXT:    lui a5, 262144
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    sw a5, 28(sp)
+; RV32ZVBC32-NEXT:    sw a4, 20(sp)
+; RV32ZVBC32-NEXT:    sw zero, 16(sp)
+; RV32ZVBC32-NEXT:    lui a3, 4080
+; RV32ZVBC32-NEXT:    addi t0, sp, 248
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v8, 24
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a3
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vsrl.vi v0, v0, 8
+; RV32ZVBC32-NEXT:    vand.vv v0, v0, v24
+; RV32ZVBC32-NEXT:    vor.vv v8, v0, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vsrl.vi v0, v16, 24
+; RV32ZVBC32-NEXT:    vand.vx v0, v0, a3
+; RV32ZVBC32-NEXT:    vsrl.vi v24, v16, 8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v24, v16
+; RV32ZVBC32-NEXT:    vor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a3
+; RV32ZVBC32-NEXT:    vsll.vi v0, v0, 24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v8
+; RV32ZVBC32-NEXT:    vsll.vi v16, v16, 8
+; RV32ZVBC32-NEXT:    vor.vv v0, v0, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a3
+; RV32ZVBC32-NEXT:    vsll.vi v24, v24, 24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 8
+; RV32ZVBC32-NEXT:    vor.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v8, v0, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    lui t0, 61681
+; RV32ZVBC32-NEXT:    addi t0, t0, -241
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v0, t0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v24, 4
+; RV32ZVBC32-NEXT:    vand.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vsll.vi v24, v24, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vsrl.vi v24, v16, 4
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v0
+; RV32ZVBC32-NEXT:    vand.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vsll.vi v16, v16, 4
+; RV32ZVBC32-NEXT:    vor.vv v16, v24, v16
+; RV32ZVBC32-NEXT:    lui t0, 209715
+; RV32ZVBC32-NEXT:    addi t0, t0, 819
+; RV32ZVBC32-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v0, t0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
+; RV32ZVBC32-NEXT:    vsrl.vi v24, v8, 2
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v24, v16, 2
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v0
+; RV32ZVBC32-NEXT:    vand.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vsll.vi v16, v16, 2
+; RV32ZVBC32-NEXT:    vor.vv v24, v24, v16
+; RV32ZVBC32-NEXT:    lui t0, 349525
+; RV32ZVBC32-NEXT:    addi t0, t0, 1365
+; RV32ZVBC32-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v0, t0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v0
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v16, v16, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v24, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v8, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi t0, sp, 8
+; RV32ZVBC32-NEXT:    vlse64.v v0, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vadd.vv v24, v24, v24
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi t0, sp, 272
+; RV32ZVBC32-NEXT:    addi t1, sp, 256
+; RV32ZVBC32-NEXT:    addi a1, sp, 264
+; RV32ZVBC32-NEXT:    addi a0, sp, 240
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a4, vlenb
+; RV32ZVBC32-NEXT:    slli a4, a4, 4
+; RV32ZVBC32-NEXT:    mv t0, a4
+; RV32ZVBC32-NEXT:    slli a4, a4, 1
+; RV32ZVBC32-NEXT:    add t0, t0, a4
+; RV32ZVBC32-NEXT:    slli a4, a4, 1
+; RV32ZVBC32-NEXT:    add t0, t0, a4
+; RV32ZVBC32-NEXT:    slli a4, a4, 3
+; RV32ZVBC32-NEXT:    add a4, a4, t0
+; RV32ZVBC32-NEXT:    add a4, sp, a4
+; RV32ZVBC32-NEXT:    addi a4, a4, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (t1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a1), zero
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 5
+; RV32ZVBC32-NEXT:    mv a4, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a4
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 232
+; RV32ZVBC32-NEXT:    addi a1, sp, 224
+; RV32ZVBC32-NEXT:    addi t0, sp, 216
+; RV32ZVBC32-NEXT:    addi t1, sp, 208
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a4, a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 200
+; RV32ZVBC32-NEXT:    addi a1, sp, 192
+; RV32ZVBC32-NEXT:    addi t0, sp, 184
+; RV32ZVBC32-NEXT:    addi t1, sp, 176
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 168
+; RV32ZVBC32-NEXT:    addi a1, sp, 160
+; RV32ZVBC32-NEXT:    addi t0, sp, 152
+; RV32ZVBC32-NEXT:    addi t1, sp, 144
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 9
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 9
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 9
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 136
+; RV32ZVBC32-NEXT:    addi a1, sp, 128
+; RV32ZVBC32-NEXT:    addi t0, sp, 120
+; RV32ZVBC32-NEXT:    addi t1, sp, 112
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 104
+; RV32ZVBC32-NEXT:    addi a1, sp, 96
+; RV32ZVBC32-NEXT:    addi t0, sp, 88
+; RV32ZVBC32-NEXT:    addi t1, sp, 80
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a4, a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 72
+; RV32ZVBC32-NEXT:    addi a1, sp, 64
+; RV32ZVBC32-NEXT:    addi t0, sp, 56
+; RV32ZVBC32-NEXT:    addi t1, sp, 48
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a4, a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a4, a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 40
+; RV32ZVBC32-NEXT:    addi a1, sp, 32
+; RV32ZVBC32-NEXT:    addi t0, sp, 24
+; RV32ZVBC32-NEXT:    addi t1, sp, 16
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a4, a4, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s11
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s9
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s7
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s5
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, ra
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s3
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, t6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 32
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 64
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 256
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 512
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 1024
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 2048
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 4096
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, t2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a7
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a5
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vi v24, v8, 2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vi v0, v8, 1
+; RV32ZVBC32-NEXT:    vand.vi v24, v8, 4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vi v8, v8, 8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vmul.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 9
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 9
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vsll.vx v8, v8, t5
+; RV32ZVBC32-NEXT:    vand.vx v16, v16, t3
+; RV32ZVBC32-NEXT:    vsll.vx v16, v16, t4
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 9
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v0, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v0, v0, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v0, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vsrl.vi v0, v24, 8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v0, v0, v16
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v8, 24
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a3
+; RV32ZVBC32-NEXT:    vor.vv v8, v0, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v0
+; RV32ZVBC32-NEXT:    vand.vx v24, v24, a3
+; RV32ZVBC32-NEXT:    vsll.vi v24, v24, 24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vsll.vi v0, v0, 8
+; RV32ZVBC32-NEXT:    vor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v0, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vsrl.vx v8, v8, t4
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, t3
+; RV32ZVBC32-NEXT:    vsrl.vx v24, v24, t5
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v8, 4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v8, 2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 288
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 352
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv8i64_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    addi sp, sp, -416
+; RV64ZVBC32-NEXT:    sd ra, 408(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s0, 400(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s1, 392(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s2, 384(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s3, 376(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s4, 368(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s5, 360(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s6, 352(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s7, 344(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s8, 336(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s9, 328(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s10, 320(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s11, 312(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    sub sp, sp, a0
+; RV64ZVBC32-NEXT:    li a1, 56
+; RV64ZVBC32-NEXT:    li a2, 40
+; RV64ZVBC32-NEXT:    lui a3, 16
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC32-NEXT:    vsrl.vx v24, v8, a1
+; RV64ZVBC32-NEXT:    vsrl.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    addi a2, a3, -256
+; RV64ZVBC32-NEXT:    vand.vx v0, v0, a2
+; RV64ZVBC32-NEXT:    vor.vv v24, v0, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vsrl.vi v24, v8, 24
+; RV64ZVBC32-NEXT:    lui a1, 4080
+; RV64ZVBC32-NEXT:    li s4, 255
+; RV64ZVBC32-NEXT:    vand.vx v24, v24, a1
+; RV64ZVBC32-NEXT:    slli s4, s4, 24
+; RV64ZVBC32-NEXT:    vsrl.vi v0, v8, 8
+; RV64ZVBC32-NEXT:    vand.vx v0, v0, s4
+; RV64ZVBC32-NEXT:    vor.vv v24, v0, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    li a1, 40
+; RV64ZVBC32-NEXT:    vsrl.vx v24, v16, a1
+; RV64ZVBC32-NEXT:    sd a2, 280(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v24, v24, a2
+; RV64ZVBC32-NEXT:    li a0, 56
+; RV64ZVBC32-NEXT:    vsrl.vx v0, v16, a0
+; RV64ZVBC32-NEXT:    vor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vsrl.vi v24, v16, 24
+; RV64ZVBC32-NEXT:    lui a3, 4080
+; RV64ZVBC32-NEXT:    vand.vx v24, v24, a3
+; RV64ZVBC32-NEXT:    vsrl.vi v0, v16, 8
+; RV64ZVBC32-NEXT:    vand.vx v0, v0, s4
+; RV64ZVBC32-NEXT:    vor.vv v24, v0, v24
+; RV64ZVBC32-NEXT:    csrr a4, vlenb
+; RV64ZVBC32-NEXT:    slli a4, a4, 3
+; RV64ZVBC32-NEXT:    mv a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 3
+; RV64ZVBC32-NEXT:    add a4, a4, a5
+; RV64ZVBC32-NEXT:    add a4, sp, a4
+; RV64ZVBC32-NEXT:    addi a4, a4, 304
+; RV64ZVBC32-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a4, vlenb
+; RV64ZVBC32-NEXT:    slli a4, a4, 3
+; RV64ZVBC32-NEXT:    mv a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 3
+; RV64ZVBC32-NEXT:    add a5, a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a5
+; RV64ZVBC32-NEXT:    add a4, sp, a4
+; RV64ZVBC32-NEXT:    addi a4, a4, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    csrr a4, vlenb
+; RV64ZVBC32-NEXT:    slli a4, a4, 4
+; RV64ZVBC32-NEXT:    mv a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 3
+; RV64ZVBC32-NEXT:    add a5, a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a5
+; RV64ZVBC32-NEXT:    add a4, sp, a4
+; RV64ZVBC32-NEXT:    addi a4, a4, 304
+; RV64ZVBC32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vor.vv v24, v0, v24
+; RV64ZVBC32-NEXT:    csrr a4, vlenb
+; RV64ZVBC32-NEXT:    slli a4, a4, 3
+; RV64ZVBC32-NEXT:    mv a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 3
+; RV64ZVBC32-NEXT:    add a5, a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a5
+; RV64ZVBC32-NEXT:    add a4, sp, a4
+; RV64ZVBC32-NEXT:    addi a4, a4, 304
+; RV64ZVBC32-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a3
+; RV64ZVBC32-NEXT:    vsll.vi v0, v0, 24
+; RV64ZVBC32-NEXT:    vand.vx v24, v8, s4
+; RV64ZVBC32-NEXT:    sd s4, 288(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vsll.vi v24, v24, 8
+; RV64ZVBC32-NEXT:    vor.vv v24, v0, v24
+; RV64ZVBC32-NEXT:    csrr a4, vlenb
+; RV64ZVBC32-NEXT:    slli a4, a4, 7
+; RV64ZVBC32-NEXT:    mv a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a5
+; RV64ZVBC32-NEXT:    add a4, sp, a4
+; RV64ZVBC32-NEXT:    addi a4, a4, 304
+; RV64ZVBC32-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vsll.vx v24, v8, a0
+; RV64ZVBC32-NEXT:    li a0, 56
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a2
+; RV64ZVBC32-NEXT:    vsll.vx v8, v8, a1
+; RV64ZVBC32-NEXT:    vor.vv v8, v24, v8
+; RV64ZVBC32-NEXT:    csrr a4, vlenb
+; RV64ZVBC32-NEXT:    slli a4, a4, 3
+; RV64ZVBC32-NEXT:    mv a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 4
+; RV64ZVBC32-NEXT:    add a5, a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a5
+; RV64ZVBC32-NEXT:    add a4, sp, a4
+; RV64ZVBC32-NEXT:    addi a4, a4, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    csrr a4, vlenb
+; RV64ZVBC32-NEXT:    slli a4, a4, 3
+; RV64ZVBC32-NEXT:    mv a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 3
+; RV64ZVBC32-NEXT:    add a4, a4, a5
+; RV64ZVBC32-NEXT:    add a4, sp, a4
+; RV64ZVBC32-NEXT:    addi a4, a4, 304
+; RV64ZVBC32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vor.vv v24, v0, v24
+; RV64ZVBC32-NEXT:    csrr a4, vlenb
+; RV64ZVBC32-NEXT:    slli a4, a4, 4
+; RV64ZVBC32-NEXT:    mv a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 3
+; RV64ZVBC32-NEXT:    add a5, a5, a4
+; RV64ZVBC32-NEXT:    slli a4, a4, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a5
+; RV64ZVBC32-NEXT:    add a4, sp, a4
+; RV64ZVBC32-NEXT:    addi a4, a4, 304
+; RV64ZVBC32-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v24, v16, a3
+; RV64ZVBC32-NEXT:    vsll.vi v24, v24, 24
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, s4
+; RV64ZVBC32-NEXT:    vsll.vi v0, v0, 8
+; RV64ZVBC32-NEXT:    vor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vsll.vx v0, v16, a0
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC32-NEXT:    vsll.vx v16, v16, a1
+; RV64ZVBC32-NEXT:    vor.vv v16, v0, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 7
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    vor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    lui a0, 61681
+; RV64ZVBC32-NEXT:    lui a1, 209715
+; RV64ZVBC32-NEXT:    lui a2, 349525
+; RV64ZVBC32-NEXT:    li a4, 16
+; RV64ZVBC32-NEXT:    li a3, 32
+; RV64ZVBC32-NEXT:    li t2, 1
+; RV64ZVBC32-NEXT:    addi a7, a0, -241
+; RV64ZVBC32-NEXT:    addi t0, a1, 819
+; RV64ZVBC32-NEXT:    addi t1, a2, 1365
+; RV64ZVBC32-NEXT:    slli a0, t2, 11
+; RV64ZVBC32-NEXT:    sd a0, 248(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 31
+; RV64ZVBC32-NEXT:    sd a0, 240(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 32
+; RV64ZVBC32-NEXT:    sd a0, 232(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 33
+; RV64ZVBC32-NEXT:    sd a0, 224(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 34
+; RV64ZVBC32-NEXT:    sd a0, 216(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 35
+; RV64ZVBC32-NEXT:    sd a0, 208(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 36
+; RV64ZVBC32-NEXT:    sd a0, 200(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 37
+; RV64ZVBC32-NEXT:    sd a0, 192(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 38
+; RV64ZVBC32-NEXT:    sd a0, 184(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 39
+; RV64ZVBC32-NEXT:    sd a0, 176(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 40
+; RV64ZVBC32-NEXT:    sd a0, 168(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 41
+; RV64ZVBC32-NEXT:    sd a0, 160(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 42
+; RV64ZVBC32-NEXT:    sd a0, 152(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 43
+; RV64ZVBC32-NEXT:    sd a0, 144(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 44
+; RV64ZVBC32-NEXT:    sd a0, 136(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, a7, 32
+; RV64ZVBC32-NEXT:    add a7, a7, a0
+; RV64ZVBC32-NEXT:    slli a0, t0, 32
+; RV64ZVBC32-NEXT:    add t0, t0, a0
+; RV64ZVBC32-NEXT:    slli a0, t1, 32
+; RV64ZVBC32-NEXT:    add a0, t1, a0
+; RV64ZVBC32-NEXT:    sd a0, 272(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a1, t2, 45
+; RV64ZVBC32-NEXT:    sd a1, 128(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 4
+; RV64ZVBC32-NEXT:    mv a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a2
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vsrl.vi v24, v16, 4
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a7
+; RV64ZVBC32-NEXT:    vand.vx v24, v24, a7
+; RV64ZVBC32-NEXT:    sd a7, 256(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vsll.vi v16, v16, 4
+; RV64ZVBC32-NEXT:    vor.vv v16, v24, v16
+; RV64ZVBC32-NEXT:    vsrl.vi v24, v16, 2
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, t0
+; RV64ZVBC32-NEXT:    vand.vx v24, v24, t0
+; RV64ZVBC32-NEXT:    sd t0, 264(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vsll.vi v16, v16, 2
+; RV64ZVBC32-NEXT:    vor.vv v16, v24, v16
+; RV64ZVBC32-NEXT:    vsrl.vi v24, v16, 1
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC32-NEXT:    vand.vx v24, v24, a0
+; RV64ZVBC32-NEXT:    vadd.vv v16, v16, v16
+; RV64ZVBC32-NEXT:    vor.vv v0, v24, v16
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a4
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 4
+; RV64ZVBC32-NEXT:    mv a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a2
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a1, t2, 46
+; RV64ZVBC32-NEXT:    sd a1, 120(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    mv a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 3
+; RV64ZVBC32-NEXT:    add a2, a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a2
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 4
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a7
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a7
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, t0
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, t0
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a3
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 47
+; RV64ZVBC32-NEXT:    sd a0, 112(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 48
+; RV64ZVBC32-NEXT:    sd a0, 104(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 49
+; RV64ZVBC32-NEXT:    sd a0, 96(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 50
+; RV64ZVBC32-NEXT:    sd a0, 88(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 51
+; RV64ZVBC32-NEXT:    sd a0, 80(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 52
+; RV64ZVBC32-NEXT:    sd a0, 72(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 53
+; RV64ZVBC32-NEXT:    sd a0, 64(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 54
+; RV64ZVBC32-NEXT:    sd a0, 56(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 55
+; RV64ZVBC32-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 56
+; RV64ZVBC32-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a0, t2, 57
+; RV64ZVBC32-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli ra, t2, 58
+; RV64ZVBC32-NEXT:    slli s10, t2, 59
+; RV64ZVBC32-NEXT:    slli s8, t2, 60
+; RV64ZVBC32-NEXT:    slli a0, t2, 61
+; RV64ZVBC32-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s11, t2, 62
+; RV64ZVBC32-NEXT:    li a1, -1
+; RV64ZVBC32-NEXT:    slli s9, a1, 63
+; RV64ZVBC32-NEXT:    li a0, 64
+; RV64ZVBC32-NEXT:    li a1, 128
+; RV64ZVBC32-NEXT:    li a2, 256
+; RV64ZVBC32-NEXT:    li a3, 512
+; RV64ZVBC32-NEXT:    li a4, 1024
+; RV64ZVBC32-NEXT:    lui a5, 1
+; RV64ZVBC32-NEXT:    lui a6, 2
+; RV64ZVBC32-NEXT:    lui a7, 4
+; RV64ZVBC32-NEXT:    lui t0, 8
+; RV64ZVBC32-NEXT:    lui t1, 32
+; RV64ZVBC32-NEXT:    lui t2, 64
+; RV64ZVBC32-NEXT:    lui t3, 128
+; RV64ZVBC32-NEXT:    lui t4, 256
+; RV64ZVBC32-NEXT:    lui t5, 512
+; RV64ZVBC32-NEXT:    lui t6, 1024
+; RV64ZVBC32-NEXT:    lui s0, 2048
+; RV64ZVBC32-NEXT:    lui s1, 4096
+; RV64ZVBC32-NEXT:    lui s2, 8192
+; RV64ZVBC32-NEXT:    lui s3, 16384
+; RV64ZVBC32-NEXT:    lui s4, 32768
+; RV64ZVBC32-NEXT:    lui s5, 65536
+; RV64ZVBC32-NEXT:    lui s6, 131072
+; RV64ZVBC32-NEXT:    lui s7, 262144
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    sd s8, 8(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    mv s8, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add s8, s8, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, s8
+; RV64ZVBC32-NEXT:    ld s8, 8(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 7
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a2
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a3
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a4
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 248(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a5
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a6
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a7
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, t0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    lui a0, 16
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, t1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, t2
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, t3
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, t4
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, t5
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, t6
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, s0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 9
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, s1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, s2
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, s3
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, s4
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, s5
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, s6
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, s7
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 240(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 232(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 224(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 216(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 208(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 200(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 192(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 176(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 7
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 168(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 160(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 152(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 144(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 136(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 128(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 88(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 80(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 72(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 64(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 56(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 8
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, ra
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, s10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, s8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vi v16, v0, 2
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vi v24, v0, 1
+; RV64ZVBC32-NEXT:    vand.vi v16, v0, 4
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vi v16, v0, 8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, a0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, s11
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v16, v0, s9
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v24, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v0, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 7
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 7
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 9
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    addi a0, sp, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 9
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 7
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 7
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 8
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 7
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 7
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 6
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v8, v24
+; RV64ZVBC32-NEXT:    addi a0, sp, 304
+; RV64ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    li a0, 56
+; RV64ZVBC32-NEXT:    vsll.vx v16, v16, a0
+; RV64ZVBC32-NEXT:    ld a1, 280(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC32-NEXT:    li a2, 40
+; RV64ZVBC32-NEXT:    vsll.vx v8, v8, a2
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v24, v8
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 5
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 5
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 5
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 6
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 9
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 5
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 6
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v24, v8
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 5
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 7
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v0, v8, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v0, v0, v16
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v0, v16
+; RV64ZVBC32-NEXT:    vsrl.vi v0, v24, 8
+; RV64ZVBC32-NEXT:    ld a4, 288(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v0, v0, a4
+; RV64ZVBC32-NEXT:    vsrl.vi v8, v8, 24
+; RV64ZVBC32-NEXT:    lui a3, 4080
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a3
+; RV64ZVBC32-NEXT:    vor.vv v8, v0, v8
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 4
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 4
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 304
+; RV64ZVBC32-NEXT:    vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 3
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 2
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 3
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 304
+; RV64ZVBC32-NEXT:    vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 5
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 3
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 3
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 2
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 3
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 4
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 2
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 3
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 3
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 3
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 3
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 6
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 3
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v8, v16
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a5, a5, 3
+; RV64ZVBC32-NEXT:    mv a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 1
+; RV64ZVBC32-NEXT:    add a6, a6, a5
+; RV64ZVBC32-NEXT:    slli a5, a5, 4
+; RV64ZVBC32-NEXT:    add a5, a5, a6
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 304
+; RV64ZVBC32-NEXT:    vl8r.v v0, (a5) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v0
+; RV64ZVBC32-NEXT:    vand.vx v24, v24, a3
+; RV64ZVBC32-NEXT:    vsll.vi v24, v24, 24
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a4
+; RV64ZVBC32-NEXT:    vsll.vi v0, v0, 8
+; RV64ZVBC32-NEXT:    vor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v0, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v0, v24
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v0, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v0, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 7
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v0, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 304
+; RV64ZVBC32-NEXT:    vl8r.v v0, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vsrl.vx v8, v8, a2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC32-NEXT:    vsrl.vx v24, v24, a0
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 304
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vor.vv v8, v24, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 4
+; RV64ZVBC32-NEXT:    ld a0, 256(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC32-NEXT:    ld a0, 264(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC32-NEXT:    ld a0, 272(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add sp, sp, a0
+; RV64ZVBC32-NEXT:    ld ra, 408(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s0, 400(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s1, 392(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s2, 384(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s3, 376(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s4, 368(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s5, 360(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s6, 352(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s7, 344(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s8, 336(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s9, 328(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s10, 320(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s11, 312(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    addi sp, sp, 416
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 8 x i64> %va to <vscale x 8 x i128>
   %vb.ext = zext <vscale x 8 x i64> %vb to <vscale x 8 x i128>
   %clmul = call <vscale x 8 x i128> @llvm.clmul.nxv8i128(<vscale x 8 x i128> %va.ext, <vscale x 8 x i128> %vb.ext)
@@ -43458,23 +75012,4507 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV64V-NEXT:    addi sp, sp, 16
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv8i64_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    addi sp, sp, -16
-; RV32ZVBC-NEXT:    sw a0, 8(sp)
-; RV32ZVBC-NEXT:    sw a1, 12(sp)
-; RV32ZVBC-NEXT:    addi a0, sp, 8
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vlse64.v v16, (a0), zero
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v16
-; RV32ZVBC-NEXT:    addi sp, sp, 16
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv8i64_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    addi sp, sp, -16
+; RV32ZVBC64-NEXT:    sw a0, 8(sp)
+; RV32ZVBC64-NEXT:    sw a1, 12(sp)
+; RV32ZVBC64-NEXT:    addi a0, sp, 8
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vlse64.v v16, (a0), zero
+; RV32ZVBC64-NEXT:    vclmulh.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    addi sp, sp, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv8i64_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vclmulh.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv8i64_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -368
+; RV32ZVBC32-NEXT:    sw ra, 364(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 360(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 356(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 352(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 348(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 344(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 340(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 336(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a2, 5
+; RV32ZVBC32-NEXT:    mv a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 2
+; RV32ZVBC32-NEXT:    add a3, a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 2
+; RV32ZVBC32-NEXT:    add a2, a2, a3
+; RV32ZVBC32-NEXT:    sub sp, sp, a2
+; RV32ZVBC32-NEXT:    csrr a2, vlenb
+; RV32ZVBC32-NEXT:    slli a2, a2, 3
+; RV32ZVBC32-NEXT:    mv a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 4
+; RV32ZVBC32-NEXT:    add a3, a3, a2
+; RV32ZVBC32-NEXT:    slli a2, a2, 2
+; RV32ZVBC32-NEXT:    add a2, a2, a3
+; RV32ZVBC32-NEXT:    add a2, sp, a2
+; RV32ZVBC32-NEXT:    addi a2, a2, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw a0, 16(sp)
+; RV32ZVBC32-NEXT:    sw a1, 20(sp)
+; RV32ZVBC32-NEXT:    addi a2, sp, 16
+; RV32ZVBC32-NEXT:    lui t6, 16
+; RV32ZVBC32-NEXT:    li t5, 56
+; RV32ZVBC32-NEXT:    li t4, 40
+; RV32ZVBC32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a2), zero
+; RV32ZVBC32-NEXT:    vsrl.vx v16, v8, t5
+; RV32ZVBC32-NEXT:    vsrl.vx v0, v8, t4
+; RV32ZVBC32-NEXT:    addi t3, t6, -256
+; RV32ZVBC32-NEXT:    vand.vx v0, v0, t3
+; RV32ZVBC32-NEXT:    vor.vv v16, v0, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v16, v8, t3
+; RV32ZVBC32-NEXT:    vsll.vx v16, v16, t4
+; RV32ZVBC32-NEXT:    vsll.vx v0, v8, t5
+; RV32ZVBC32-NEXT:    vor.vv v8, v0, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vsrl.vx v8, v24, t4
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, t3
+; RV32ZVBC32-NEXT:    vsrl.vx v0, v24, t5
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v8, v24, t3
+; RV32ZVBC32-NEXT:    vsll.vx v8, v8, t4
+; RV32ZVBC32-NEXT:    vsll.vx v0, v24, t5
+; RV32ZVBC32-NEXT:    vor.vv v8, v0, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a4, 1044480
+; RV32ZVBC32-NEXT:    lui a5, 524288
+; RV32ZVBC32-NEXT:    li ra, 1
+; RV32ZVBC32-NEXT:    li a6, 2
+; RV32ZVBC32-NEXT:    li a7, 4
+; RV32ZVBC32-NEXT:    li s0, 8
+; RV32ZVBC32-NEXT:    li s11, 16
+; RV32ZVBC32-NEXT:    li s10, 32
+; RV32ZVBC32-NEXT:    li s9, 64
+; RV32ZVBC32-NEXT:    li s8, 128
+; RV32ZVBC32-NEXT:    li s7, 256
+; RV32ZVBC32-NEXT:    li s6, 512
+; RV32ZVBC32-NEXT:    li s5, 1024
+; RV32ZVBC32-NEXT:    lui s4, 1
+; RV32ZVBC32-NEXT:    lui s3, 2
+; RV32ZVBC32-NEXT:    lui s2, 4
+; RV32ZVBC32-NEXT:    lui s1, 8
+; RV32ZVBC32-NEXT:    lui a0, 32
+; RV32ZVBC32-NEXT:    lui a1, 64
+; RV32ZVBC32-NEXT:    lui a2, 128
+; RV32ZVBC32-NEXT:    lui a3, 256
+; RV32ZVBC32-NEXT:    lui t1, 512
+; RV32ZVBC32-NEXT:    lui t0, 1024
+; RV32ZVBC32-NEXT:    lui t2, 2048
+; RV32ZVBC32-NEXT:    sw a4, 264(sp)
+; RV32ZVBC32-NEXT:    lui a4, 4096
+; RV32ZVBC32-NEXT:    sw zero, 268(sp)
+; RV32ZVBC32-NEXT:    sw a5, 24(sp)
+; RV32ZVBC32-NEXT:    sw zero, 28(sp)
+; RV32ZVBC32-NEXT:    sw zero, 288(sp)
+; RV32ZVBC32-NEXT:    sw ra, 292(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw a6, 276(sp)
+; RV32ZVBC32-NEXT:    lui a6, 8192
+; RV32ZVBC32-NEXT:    sw zero, 280(sp)
+; RV32ZVBC32-NEXT:    sw a7, 284(sp)
+; RV32ZVBC32-NEXT:    lui a7, 16384
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw s0, 260(sp)
+; RV32ZVBC32-NEXT:    lui s0, 32768
+; RV32ZVBC32-NEXT:    sw zero, 248(sp)
+; RV32ZVBC32-NEXT:    sw s11, 252(sp)
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    sw s10, 244(sp)
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    sw s9, 236(sp)
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw s8, 228(sp)
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s7, 220(sp)
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s6, 212(sp)
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s5, 204(sp)
+; RV32ZVBC32-NEXT:    slli ra, ra, 11
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw ra, 196(sp)
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw s4, 188(sp)
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw s3, 180(sp)
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw s2, 172(sp)
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw s1, 164(sp)
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw t6, 156(sp)
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw a0, 148(sp)
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw a1, 140(sp)
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw a2, 132(sp)
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw a3, 124(sp)
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw t1, 116(sp)
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw t0, 108(sp)
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw t2, 100(sp)
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw a4, 92(sp)
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw a6, 84(sp)
+; RV32ZVBC32-NEXT:    lui a2, 8192
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw a7, 76(sp)
+; RV32ZVBC32-NEXT:    lui t2, 16384
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw s0, 68(sp)
+; RV32ZVBC32-NEXT:    lui a7, 65536
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw a7, 60(sp)
+; RV32ZVBC32-NEXT:    lui a6, 131072
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw a6, 52(sp)
+; RV32ZVBC32-NEXT:    lui a4, 262144
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw a4, 44(sp)
+; RV32ZVBC32-NEXT:    sw a5, 36(sp)
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    lui a3, 4080
+; RV32ZVBC32-NEXT:    addi t0, sp, 264
+; RV32ZVBC32-NEXT:    vlse64.v v16, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v24, 24
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a3
+; RV32ZVBC32-NEXT:    vsrl.vi v0, v24, 8
+; RV32ZVBC32-NEXT:    vand.vv v0, v0, v16
+; RV32ZVBC32-NEXT:    vor.vv v8, v0, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vsrl.vi v0, v24, 24
+; RV32ZVBC32-NEXT:    vand.vx v0, v0, a3
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vsrl.vi v24, v24, 8
+; RV32ZVBC32-NEXT:    vand.vv v24, v24, v16
+; RV32ZVBC32-NEXT:    vor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vx v0, v16, a3
+; RV32ZVBC32-NEXT:    vsll.vi v0, v0, 24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v8
+; RV32ZVBC32-NEXT:    vsll.vi v16, v16, 8
+; RV32ZVBC32-NEXT:    vor.vv v16, v0, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vx v0, v8, a3
+; RV32ZVBC32-NEXT:    vsll.vi v0, v0, 24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 8
+; RV32ZVBC32-NEXT:    vor.vv v8, v0, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v16, v0, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v8, v0, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v0, v16, v24
+; RV32ZVBC32-NEXT:    lui t0, 61681
+; RV32ZVBC32-NEXT:    addi t0, t0, -241
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v8, t0
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
+; RV32ZVBC32-NEXT:    vsrl.vi v24, v0, 4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v0, v8
+; RV32ZVBC32-NEXT:    vmv8r.v v0, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vsll.vi v24, v24, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vsrl.vi v24, v16, 4
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v0
+; RV32ZVBC32-NEXT:    vand.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vsll.vi v16, v16, 4
+; RV32ZVBC32-NEXT:    vor.vv v16, v24, v16
+; RV32ZVBC32-NEXT:    lui t0, 209715
+; RV32ZVBC32-NEXT:    addi t0, t0, 819
+; RV32ZVBC32-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v0, t0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
+; RV32ZVBC32-NEXT:    vsrl.vi v24, v8, 2
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v24, v16, 2
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v0
+; RV32ZVBC32-NEXT:    vand.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vsll.vi v16, v16, 2
+; RV32ZVBC32-NEXT:    vor.vv v24, v24, v16
+; RV32ZVBC32-NEXT:    lui t0, 349525
+; RV32ZVBC32-NEXT:    addi t0, t0, 1365
+; RV32ZVBC32-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v0, t0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v0
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v0
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v16, v16, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v24, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v8, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi t0, sp, 24
+; RV32ZVBC32-NEXT:    vlse64.v v0, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vadd.vv v24, v24, v24
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi t0, sp, 288
+; RV32ZVBC32-NEXT:    addi t1, sp, 272
+; RV32ZVBC32-NEXT:    addi a1, sp, 280
+; RV32ZVBC32-NEXT:    addi a0, sp, 256
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a5, vlenb
+; RV32ZVBC32-NEXT:    slli a5, a5, 4
+; RV32ZVBC32-NEXT:    mv t0, a5
+; RV32ZVBC32-NEXT:    slli a5, a5, 1
+; RV32ZVBC32-NEXT:    add t0, t0, a5
+; RV32ZVBC32-NEXT:    slli a5, a5, 1
+; RV32ZVBC32-NEXT:    add t0, t0, a5
+; RV32ZVBC32-NEXT:    slli a5, a5, 3
+; RV32ZVBC32-NEXT:    add a5, a5, t0
+; RV32ZVBC32-NEXT:    add a5, sp, a5
+; RV32ZVBC32-NEXT:    addi a5, a5, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a5) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (t1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a1), zero
+; RV32ZVBC32-NEXT:    csrr a1, vlenb
+; RV32ZVBC32-NEXT:    slli a1, a1, 5
+; RV32ZVBC32-NEXT:    mv a5, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 1
+; RV32ZVBC32-NEXT:    add a5, a5, a1
+; RV32ZVBC32-NEXT:    slli a1, a1, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a5
+; RV32ZVBC32-NEXT:    add a1, sp, a1
+; RV32ZVBC32-NEXT:    addi a1, a1, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 248
+; RV32ZVBC32-NEXT:    addi a1, sp, 240
+; RV32ZVBC32-NEXT:    addi t0, sp, 232
+; RV32ZVBC32-NEXT:    addi t1, sp, 224
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a5, a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 216
+; RV32ZVBC32-NEXT:    addi a1, sp, 208
+; RV32ZVBC32-NEXT:    addi t0, sp, 200
+; RV32ZVBC32-NEXT:    addi t1, sp, 192
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a5, a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 184
+; RV32ZVBC32-NEXT:    addi a1, sp, 176
+; RV32ZVBC32-NEXT:    addi t0, sp, 168
+; RV32ZVBC32-NEXT:    addi t1, sp, 160
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 9
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 9
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 9
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 152
+; RV32ZVBC32-NEXT:    addi a1, sp, 144
+; RV32ZVBC32-NEXT:    addi t0, sp, 136
+; RV32ZVBC32-NEXT:    addi t1, sp, 128
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a5, a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a5, a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a5, a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 120
+; RV32ZVBC32-NEXT:    addi a1, sp, 112
+; RV32ZVBC32-NEXT:    addi t0, sp, 104
+; RV32ZVBC32-NEXT:    addi t1, sp, 96
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a5, a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a5, a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 88
+; RV32ZVBC32-NEXT:    addi a1, sp, 80
+; RV32ZVBC32-NEXT:    addi t0, sp, 72
+; RV32ZVBC32-NEXT:    addi t1, sp, 64
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a5, a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a5, a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 56
+; RV32ZVBC32-NEXT:    addi a1, sp, 48
+; RV32ZVBC32-NEXT:    addi t0, sp, 40
+; RV32ZVBC32-NEXT:    addi t1, sp, 32
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a5, a5, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v0, (a1), zero
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t0), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vlse64.v v24, (t1), zero
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s11
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s10
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s9
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s7
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s5
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, ra
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s3
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, t6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 32
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 64
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 128
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 256
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 512
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 1024
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 2048
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    lui a0, 4096
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, t2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, s0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a7
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a6
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vx v24, v8, a4
+; RV32ZVBC32-NEXT:    addi a0, sp, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vi v24, v8, 2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vi v0, v8, 1
+; RV32ZVBC32-NEXT:    vand.vi v24, v8, 4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vand.vi v8, v8, 8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vmul.vv v24, v16, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    addi a0, sp, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 9
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 9
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vsll.vx v8, v8, t5
+; RV32ZVBC32-NEXT:    vand.vx v16, v16, t3
+; RV32ZVBC32-NEXT:    vsll.vx v16, v16, t4
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 8
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 9
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v0, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v0, v0, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v0, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    vsrl.vi v0, v24, 8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v0, v0, v16
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v8, 24
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, a3
+; RV32ZVBC32-NEXT:    vor.vv v8, v0, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 6
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v8, v16
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v0
+; RV32ZVBC32-NEXT:    vand.vx v24, v24, a3
+; RV32ZVBC32-NEXT:    vsll.vi v24, v24, 24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v0, v8, v0
+; RV32ZVBC32-NEXT:    vsll.vi v0, v0, 8
+; RV32ZVBC32-NEXT:    vor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v0, v16, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v0, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 1
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC32-NEXT:    vsrl.vx v8, v8, t4
+; RV32ZVBC32-NEXT:    vand.vx v8, v8, t3
+; RV32ZVBC32-NEXT:    vsrl.vx v24, v24, t5
+; RV32ZVBC32-NEXT:    vor.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vor.vv v8, v24, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v8, 4
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v8, 2
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 7
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 4
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 3
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add a0, sp, a0
+; RV32ZVBC32-NEXT:    addi a0, a0, 304
+; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC32-NEXT:    vand.vv v8, v8, v24
+; RV32ZVBC32-NEXT:    vand.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV32ZVBC32-NEXT:    csrr a0, vlenb
+; RV32ZVBC32-NEXT:    slli a0, a0, 5
+; RV32ZVBC32-NEXT:    mv a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a1, a1, a0
+; RV32ZVBC32-NEXT:    slli a0, a0, 2
+; RV32ZVBC32-NEXT:    add a0, a0, a1
+; RV32ZVBC32-NEXT:    add sp, sp, a0
+; RV32ZVBC32-NEXT:    lw ra, 364(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 360(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 356(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 352(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 348(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 344(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 340(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 336(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    addi sp, sp, 368
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv8i64_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vclmulh.vx v8, v8, a0
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmulh_nxv8i64_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    addi sp, sp, -16
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 5
+; RV64ZVBC32-NEXT:    sub sp, sp, a1
+; RV64ZVBC32-NEXT:    li a1, 56
+; RV64ZVBC32-NEXT:    lui t2, 16
+; RV64ZVBC32-NEXT:    lui a2, 4080
+; RV64ZVBC32-NEXT:    li t0, 255
+; RV64ZVBC32-NEXT:    lui a3, 61681
+; RV64ZVBC32-NEXT:    lui a4, 209715
+; RV64ZVBC32-NEXT:    lui a5, 349525
+; RV64ZVBC32-NEXT:    srli a6, a0, 24
+; RV64ZVBC32-NEXT:    srli a7, a0, 8
+; RV64ZVBC32-NEXT:    srli t1, a0, 40
+; RV64ZVBC32-NEXT:    srli t3, a0, 56
+; RV64ZVBC32-NEXT:    addi a3, a3, -241
+; RV64ZVBC32-NEXT:    addi a4, a4, 819
+; RV64ZVBC32-NEXT:    addi t4, a5, 1365
+; RV64ZVBC32-NEXT:    slli a5, a3, 32
+; RV64ZVBC32-NEXT:    add a5, a3, a5
+; RV64ZVBC32-NEXT:    slli a3, a4, 32
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, t4, 32
+; RV64ZVBC32-NEXT:    add a3, t4, a3
+; RV64ZVBC32-NEXT:    srliw t4, a0, 24
+; RV64ZVBC32-NEXT:    slli t0, t0, 24
+; RV64ZVBC32-NEXT:    and a6, a6, a2
+; RV64ZVBC32-NEXT:    and a7, a7, t0
+; RV64ZVBC32-NEXT:    or t5, a7, a6
+; RV64ZVBC32-NEXT:    addi a6, t2, -256
+; RV64ZVBC32-NEXT:    and a7, t1, a6
+; RV64ZVBC32-NEXT:    or t1, a7, t3
+; RV64ZVBC32-NEXT:    and a7, a0, a2
+; RV64ZVBC32-NEXT:    slli t4, t4, 32
+; RV64ZVBC32-NEXT:    slli a7, a7, 24
+; RV64ZVBC32-NEXT:    or t3, a7, t4
+; RV64ZVBC32-NEXT:    li a7, 40
+; RV64ZVBC32-NEXT:    vsetvli t4, zero, e64, m8, ta, ma
+; RV64ZVBC32-NEXT:    vsrl.vi v24, v8, 24
+; RV64ZVBC32-NEXT:    vsrl.vx v16, v8, a1
+; RV64ZVBC32-NEXT:    vsrl.vx v0, v8, a7
+; RV64ZVBC32-NEXT:    vand.vx v0, v0, a6
+; RV64ZVBC32-NEXT:    vor.vv v16, v0, v16
+; RV64ZVBC32-NEXT:    vsrl.vi v0, v8, 8
+; RV64ZVBC32-NEXT:    or t1, t5, t1
+; RV64ZVBC32-NEXT:    slli t4, a0, 56
+; RV64ZVBC32-NEXT:    and a0, a0, a6
+; RV64ZVBC32-NEXT:    slli a0, a0, 40
+; RV64ZVBC32-NEXT:    or t4, t4, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    or t4, t4, t3
+; RV64ZVBC32-NEXT:    lui t3, 1
+; RV64ZVBC32-NEXT:    vand.vx v24, v24, a2
+; RV64ZVBC32-NEXT:    vand.vx v0, v0, t0
+; RV64ZVBC32-NEXT:    vor.vv v24, v0, v24
+; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    vsll.vi v0, v0, 24
+; RV64ZVBC32-NEXT:    vor.vv v16, v24, v16
+; RV64ZVBC32-NEXT:    vand.vx v24, v8, t0
+; RV64ZVBC32-NEXT:    vsll.vi v24, v24, 8
+; RV64ZVBC32-NEXT:    vor.vv v24, v0, v24
+; RV64ZVBC32-NEXT:    vsll.vx v0, v8, a1
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a6
+; RV64ZVBC32-NEXT:    vsll.vx v8, v8, a7
+; RV64ZVBC32-NEXT:    vor.vv v8, v0, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v24
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 4
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a5
+; RV64ZVBC32-NEXT:    srli t4, t1, 4
+; RV64ZVBC32-NEXT:    and t1, t1, a5
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a5
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    and t4, t4, a5
+; RV64ZVBC32-NEXT:    slli t1, t1, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a4
+; RV64ZVBC32-NEXT:    srli t4, t1, 2
+; RV64ZVBC32-NEXT:    and t1, t1, a4
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a4
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    and t4, t4, a4
+; RV64ZVBC32-NEXT:    slli t1, t1, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a3
+; RV64ZVBC32-NEXT:    srli t4, t1, 1
+; RV64ZVBC32-NEXT:    and t1, t1, a3
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a3
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    and t4, t4, a3
+; RV64ZVBC32-NEXT:    slli t1, t1, 1
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    andi t4, t1, 2
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 1
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 4
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 8
+; RV64ZVBC32-NEXT:    vxor.vv v16, v24, v16
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 16
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 32
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 64
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 128
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 256
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 512
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    csrr t5, vlenb
+; RV64ZVBC32-NEXT:    slli t5, t5, 3
+; RV64ZVBC32-NEXT:    mv t6, t5
+; RV64ZVBC32-NEXT:    slli t5, t5, 1
+; RV64ZVBC32-NEXT:    add t5, t5, t6
+; RV64ZVBC32-NEXT:    add t5, sp, t5
+; RV64ZVBC32-NEXT:    addi t5, t5, 16
+; RV64ZVBC32-NEXT:    vs8r.v v16, (t5) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 1024
+; RV64ZVBC32-NEXT:    vxor.vv v0, v16, v0
+; RV64ZVBC32-NEXT:    vxor.vv v24, v0, v24
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 11
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 2
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 4
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 8
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 32
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 64
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 128
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 256
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 512
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 1024
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 2048
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t4
+; RV64ZVBC32-NEXT:    lui t4, 4096
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 8192
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t3
+; RV64ZVBC32-NEXT:    vxor.vv v0, v24, v0
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    vxor.vv v24, v0, v24
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v16
+; RV64ZVBC32-NEXT:    lui t2, 16384
+; RV64ZVBC32-NEXT:    lui t3, 32768
+; RV64ZVBC32-NEXT:    lui t4, 65536
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    csrr t5, vlenb
+; RV64ZVBC32-NEXT:    slli t5, t5, 3
+; RV64ZVBC32-NEXT:    mv t6, t5
+; RV64ZVBC32-NEXT:    slli t5, t5, 1
+; RV64ZVBC32-NEXT:    add t5, t5, t6
+; RV64ZVBC32-NEXT:    add t5, sp, t5
+; RV64ZVBC32-NEXT:    addi t5, t5, 16
+; RV64ZVBC32-NEXT:    vl8r.v v16, (t5) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vsll.vx v16, v16, a1
+; RV64ZVBC32-NEXT:    vand.vx v0, v0, a6
+; RV64ZVBC32-NEXT:    vsll.vx v0, v0, a7
+; RV64ZVBC32-NEXT:    vor.vv v16, v16, v0
+; RV64ZVBC32-NEXT:    csrr t5, vlenb
+; RV64ZVBC32-NEXT:    slli t5, t5, 3
+; RV64ZVBC32-NEXT:    add t5, sp, t5
+; RV64ZVBC32-NEXT:    addi t5, t5, 16
+; RV64ZVBC32-NEXT:    vs8r.v v16, (t5) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t2
+; RV64ZVBC32-NEXT:    lui t2, 131072
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v16, v24, v16
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t3
+; RV64ZVBC32-NEXT:    lui t3, 262144
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 32
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 33
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t3
+; RV64ZVBC32-NEXT:    srliw t3, t1, 31
+; RV64ZVBC32-NEXT:    slli t3, t3, 31
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 34
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 35
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 36
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 37
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 38
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 39
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 40
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 41
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v24, v16, v24
+; RV64ZVBC32-NEXT:    csrr t2, vlenb
+; RV64ZVBC32-NEXT:    slli t2, t2, 3
+; RV64ZVBC32-NEXT:    mv t5, t2
+; RV64ZVBC32-NEXT:    slli t2, t2, 1
+; RV64ZVBC32-NEXT:    add t2, t2, t5
+; RV64ZVBC32-NEXT:    add t2, sp, t2
+; RV64ZVBC32-NEXT:    addi t2, t2, 16
+; RV64ZVBC32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t3
+; RV64ZVBC32-NEXT:    vxor.vv v16, v24, v16
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t4
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    slli t2, a0, 42
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    slli t2, a0, 43
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    slli t2, a0, 44
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    slli t2, a0, 45
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    slli t2, a0, 46
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    slli t2, a0, 47
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    slli t2, a0, 48
+; RV64ZVBC32-NEXT:    slli t3, a0, 49
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v24, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t3
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    slli t2, a0, 50
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    csrr t2, vlenb
+; RV64ZVBC32-NEXT:    slli t2, t2, 3
+; RV64ZVBC32-NEXT:    mv t3, t2
+; RV64ZVBC32-NEXT:    slli t2, t2, 1
+; RV64ZVBC32-NEXT:    add t2, t2, t3
+; RV64ZVBC32-NEXT:    add t2, sp, t2
+; RV64ZVBC32-NEXT:    addi t2, t2, 16
+; RV64ZVBC32-NEXT:    vl8r.v v0, (t2) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vsrl.vi v0, v0, 8
+; RV64ZVBC32-NEXT:    vand.vx v0, v0, t0
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v16, 24
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC32-NEXT:    vor.vv v16, v0, v16
+; RV64ZVBC32-NEXT:    addi t2, sp, 16
+; RV64ZVBC32-NEXT:    vs8r.v v16, (t2) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli t2, a0, 51
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v24, v16
+; RV64ZVBC32-NEXT:    slli t2, a0, 52
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    slli t2, a0, 53
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    slli t2, a0, 54
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    slli t2, a0, 55
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC32-NEXT:    csrr t2, vlenb
+; RV64ZVBC32-NEXT:    slli t2, t2, 4
+; RV64ZVBC32-NEXT:    add t2, sp, t2
+; RV64ZVBC32-NEXT:    addi t2, t2, 16
+; RV64ZVBC32-NEXT:    vs8r.v v16, (t2) # vscale x 64-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli t2, a0, 56
+; RV64ZVBC32-NEXT:    slli t3, a0, 57
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vmul.vx v24, v8, t2
+; RV64ZVBC32-NEXT:    vxor.vv v24, v16, v24
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, t3
+; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC32-NEXT:    csrr t2, vlenb
+; RV64ZVBC32-NEXT:    slli t2, t2, 3
+; RV64ZVBC32-NEXT:    mv t3, t2
+; RV64ZVBC32-NEXT:    slli t2, t2, 1
+; RV64ZVBC32-NEXT:    add t2, t2, t3
+; RV64ZVBC32-NEXT:    add t2, sp, t2
+; RV64ZVBC32-NEXT:    addi t2, t2, 16
+; RV64ZVBC32-NEXT:    vl8r.v v16, (t2) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v0, v16, a2
+; RV64ZVBC32-NEXT:    vsll.vi v0, v0, 24
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 4
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 16
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, t0
+; RV64ZVBC32-NEXT:    vsll.vi v16, v16, 8
+; RV64ZVBC32-NEXT:    vor.vv v16, v0, v16
+; RV64ZVBC32-NEXT:    slli a2, a0, 58
+; RV64ZVBC32-NEXT:    and a2, t1, a2
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    vxor.vv v0, v24, v0
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 16
+; RV64ZVBC32-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vor.vv v24, v24, v16
+; RV64ZVBC32-NEXT:    slli a2, a0, 59
+; RV64ZVBC32-NEXT:    and a2, t1, a2
+; RV64ZVBC32-NEXT:    vmul.vx v16, v8, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v0, v16
+; RV64ZVBC32-NEXT:    slli a2, a0, 60
+; RV64ZVBC32-NEXT:    and a2, t1, a2
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v0
+; RV64ZVBC32-NEXT:    slli a2, a0, 61
+; RV64ZVBC32-NEXT:    and a2, t1, a2
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, a2
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v0
+; RV64ZVBC32-NEXT:    slli a0, a0, 62
+; RV64ZVBC32-NEXT:    and a0, t1, a0
+; RV64ZVBC32-NEXT:    vmul.vx v0, v8, a0
+; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v0
+; RV64ZVBC32-NEXT:    srli a0, t1, 63
+; RV64ZVBC32-NEXT:    slli a0, a0, 63
+; RV64ZVBC32-NEXT:    vmul.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 16
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vsrl.vx v16, v16, a7
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a6
+; RV64ZVBC32-NEXT:    vsrl.vx v8, v8, a1
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    addi a0, sp, 16
+; RV64ZVBC32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v24, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 4
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a5
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a5
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a4
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a4
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, a3
+; RV64ZVBC32-NEXT:    vand.vx v16, v16, a3
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v8, v8, 1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add sp, sp, a0
+; RV64ZVBC32-NEXT:    addi sp, sp, 16
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i128 0
   %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
   %va.ext = zext <vscale x 8 x i64> %va to <vscale x 8 x i128>
@@ -45806,17 +81844,2337 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscal
 ; RV64V-NEXT:    .cfi_def_cfa_offset 0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv1i64_vv_mask:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v9, v0.t
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv1i64_vv_mask:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
+; RV32ZVBC64-NEXT:    vclmulh.vv v8, v8, v9, v0.t
+; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv1i64_vv_mask:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
-; RV64ZVBC-NEXT:    vclmulh.vv v8, v8, v9, v0.t
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC64-LABEL: clmulh_nxv1i64_vv_mask:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
+; RV64ZVBC64-NEXT:    vclmulh.vv v8, v8, v9, v0.t
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv1i64_vv_mask:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -336
+; RV32ZVBC32-NEXT:    .cfi_def_cfa_offset 336
+; RV32ZVBC32-NEXT:    sw ra, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 296(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 292(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 288(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 284(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    .cfi_offset ra, -4
+; RV32ZVBC32-NEXT:    .cfi_offset s0, -8
+; RV32ZVBC32-NEXT:    .cfi_offset s1, -12
+; RV32ZVBC32-NEXT:    .cfi_offset s2, -16
+; RV32ZVBC32-NEXT:    .cfi_offset s3, -20
+; RV32ZVBC32-NEXT:    .cfi_offset s4, -24
+; RV32ZVBC32-NEXT:    .cfi_offset s5, -28
+; RV32ZVBC32-NEXT:    .cfi_offset s6, -32
+; RV32ZVBC32-NEXT:    .cfi_offset s7, -36
+; RV32ZVBC32-NEXT:    .cfi_offset s8, -40
+; RV32ZVBC32-NEXT:    .cfi_offset s9, -44
+; RV32ZVBC32-NEXT:    .cfi_offset s10, -48
+; RV32ZVBC32-NEXT:    .cfi_offset s11, -52
+; RV32ZVBC32-NEXT:    lui a0, 1044480
+; RV32ZVBC32-NEXT:    lui t6, 524288
+; RV32ZVBC32-NEXT:    li s11, 1
+; RV32ZVBC32-NEXT:    li s6, 2
+; RV32ZVBC32-NEXT:    li t5, 4
+; RV32ZVBC32-NEXT:    li s8, 8
+; RV32ZVBC32-NEXT:    li s10, 16
+; RV32ZVBC32-NEXT:    li s9, 32
+; RV32ZVBC32-NEXT:    li s1, 64
+; RV32ZVBC32-NEXT:    li s2, 128
+; RV32ZVBC32-NEXT:    li s3, 256
+; RV32ZVBC32-NEXT:    li s4, 512
+; RV32ZVBC32-NEXT:    li s5, 1024
+; RV32ZVBC32-NEXT:    lui t4, 1
+; RV32ZVBC32-NEXT:    lui t3, 2
+; RV32ZVBC32-NEXT:    lui t2, 4
+; RV32ZVBC32-NEXT:    lui t1, 8
+; RV32ZVBC32-NEXT:    lui s0, 16
+; RV32ZVBC32-NEXT:    lui t0, 32
+; RV32ZVBC32-NEXT:    lui a7, 64
+; RV32ZVBC32-NEXT:    lui a6, 128
+; RV32ZVBC32-NEXT:    lui a5, 256
+; RV32ZVBC32-NEXT:    lui a4, 512
+; RV32ZVBC32-NEXT:    lui a3, 1024
+; RV32ZVBC32-NEXT:    lui a2, 2048
+; RV32ZVBC32-NEXT:    lui a1, 4096
+; RV32ZVBC32-NEXT:    lui s7, 8192
+; RV32ZVBC32-NEXT:    lui ra, 16384
+; RV32ZVBC32-NEXT:    sw a0, 248(sp)
+; RV32ZVBC32-NEXT:    lui a0, 32768
+; RV32ZVBC32-NEXT:    sw zero, 252(sp)
+; RV32ZVBC32-NEXT:    sw t6, 8(sp)
+; RV32ZVBC32-NEXT:    sw zero, 12(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw s11, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw s6, 260(sp)
+; RV32ZVBC32-NEXT:    lui s6, 65536
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw t5, 268(sp)
+; RV32ZVBC32-NEXT:    lui t5, 131072
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    sw s8, 244(sp)
+; RV32ZVBC32-NEXT:    lui s8, 262144
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    sw s10, 236(sp)
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw s9, 228(sp)
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s1, 220(sp)
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s2, 212(sp)
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s3, 204(sp)
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw s4, 196(sp)
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw s5, 188(sp)
+; RV32ZVBC32-NEXT:    slli s11, s11, 11
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw s11, 180(sp)
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw t4, 172(sp)
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw t3, 164(sp)
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw t2, 156(sp)
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw t1, 148(sp)
+; RV32ZVBC32-NEXT:    lui t2, 8
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw s0, 140(sp)
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw t0, 132(sp)
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw a7, 124(sp)
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw a6, 116(sp)
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw a5, 108(sp)
+; RV32ZVBC32-NEXT:    lui t3, 256
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw a4, 100(sp)
+; RV32ZVBC32-NEXT:    lui t4, 512
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw a3, 92(sp)
+; RV32ZVBC32-NEXT:    lui a5, 1024
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw a2, 84(sp)
+; RV32ZVBC32-NEXT:    lui a4, 2048
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw a1, 76(sp)
+; RV32ZVBC32-NEXT:    lui a3, 4096
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw s7, 68(sp)
+; RV32ZVBC32-NEXT:    lui t1, 8192
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw ra, 60(sp)
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw a0, 52(sp)
+; RV32ZVBC32-NEXT:    lui s7, 32768
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw s6, 44(sp)
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    sw t5, 36(sp)
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    sw s8, 28(sp)
+; RV32ZVBC32-NEXT:    sw zero, 16(sp)
+; RV32ZVBC32-NEXT:    sw t6, 20(sp)
+; RV32ZVBC32-NEXT:    lui a0, 61681
+; RV32ZVBC32-NEXT:    addi a0, a0, -241
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 209715
+; RV32ZVBC32-NEXT:    addi a0, a0, 819
+; RV32ZVBC32-NEXT:    vmv.v.x v11, a0
+; RV32ZVBC32-NEXT:    lui a0, 349525
+; RV32ZVBC32-NEXT:    addi a0, a0, 1365
+; RV32ZVBC32-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC32-NEXT:    addi a0, sp, 248
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
+; RV32ZVBC32-NEXT:    vlse64.v v13, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 8
+; RV32ZVBC32-NEXT:    vlse64.v v6, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 272
+; RV32ZVBC32-NEXT:    vlse64.v v7, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 256
+; RV32ZVBC32-NEXT:    vlse64.v v31, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 264
+; RV32ZVBC32-NEXT:    vlse64.v v30, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 240
+; RV32ZVBC32-NEXT:    vlse64.v v29, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 232
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 224
+; RV32ZVBC32-NEXT:    vlse64.v v27, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 216
+; RV32ZVBC32-NEXT:    vlse64.v v22, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 208
+; RV32ZVBC32-NEXT:    vlse64.v v19, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 200
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 192
+; RV32ZVBC32-NEXT:    vlse64.v v23, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 184
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 176
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 168
+; RV32ZVBC32-NEXT:    vlse64.v v15, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 160
+; RV32ZVBC32-NEXT:    vlse64.v v17, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 152
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 144
+; RV32ZVBC32-NEXT:    vlse64.v v25, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 136
+; RV32ZVBC32-NEXT:    vlse64.v v21, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 128
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 120
+; RV32ZVBC32-NEXT:    vlse64.v v26, (a0), zero
+; RV32ZVBC32-NEXT:    li ra, 56
+; RV32ZVBC32-NEXT:    vsrl.vi v5, v9, 24
+; RV32ZVBC32-NEXT:    vsrl.vi v4, v9, 8
+; RV32ZVBC32-NEXT:    vsrl.vx v3, v9, ra
+; RV32ZVBC32-NEXT:    li a2, 40
+; RV32ZVBC32-NEXT:    vsrl.vx v2, v9, a2
+; RV32ZVBC32-NEXT:    lui a1, 4080
+; RV32ZVBC32-NEXT:    vand.vx v5, v5, a1
+; RV32ZVBC32-NEXT:    vsll.vx v1, v9, ra
+; RV32ZVBC32-NEXT:    addi a0, s0, -256
+; RV32ZVBC32-NEXT:    vand.vx v2, v2, a0
+; RV32ZVBC32-NEXT:    vor.vv v3, v2, v3
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, a0
+; RV32ZVBC32-NEXT:    vsll.vx v2, v2, a2
+; RV32ZVBC32-NEXT:    vor.vv v2, v1, v2
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a1
+; RV32ZVBC32-NEXT:    vsll.vi v1, v1, 24
+; RV32ZVBC32-NEXT:    vand.vv v4, v4, v13
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v13
+; RV32ZVBC32-NEXT:    vor.vv v5, v4, v5
+; RV32ZVBC32-NEXT:    vsll.vi v9, v9, 8
+; RV32ZVBC32-NEXT:    vor.vv v5, v5, v3
+; RV32ZVBC32-NEXT:    vor.vv v9, v1, v9
+; RV32ZVBC32-NEXT:    vor.vv v9, v2, v9
+; RV32ZVBC32-NEXT:    vor.vv v9, v9, v5
+; RV32ZVBC32-NEXT:    vsrl.vi v5, v9, 4
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vand.vv v5, v5, v10
+; RV32ZVBC32-NEXT:    vsll.vi v9, v9, 4
+; RV32ZVBC32-NEXT:    vor.vv v9, v5, v9
+; RV32ZVBC32-NEXT:    vsrl.vi v5, v9, 2
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vv v5, v5, v11
+; RV32ZVBC32-NEXT:    vsll.vi v9, v9, 2
+; RV32ZVBC32-NEXT:    vor.vv v9, v5, v9
+; RV32ZVBC32-NEXT:    vsrl.vi v5, v9, 1
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vand.vv v5, v5, v12
+; RV32ZVBC32-NEXT:    vadd.vv v9, v9, v9
+; RV32ZVBC32-NEXT:    vor.vv v9, v5, v9
+; RV32ZVBC32-NEXT:    vand.vx v4, v9, s10
+; RV32ZVBC32-NEXT:    vsrl.vi v5, v8, 24
+; RV32ZVBC32-NEXT:    vsrl.vx v3, v8, ra
+; RV32ZVBC32-NEXT:    vsrl.vx v2, v8, a2
+; RV32ZVBC32-NEXT:    vsll.vx v1, v8, ra
+; RV32ZVBC32-NEXT:    vand.vx v2, v2, a0
+; RV32ZVBC32-NEXT:    vor.vv v3, v2, v3
+; RV32ZVBC32-NEXT:    vand.vx v2, v8, a0
+; RV32ZVBC32-NEXT:    vsll.vx v2, v2, a2
+; RV32ZVBC32-NEXT:    vor.vv v2, v1, v2
+; RV32ZVBC32-NEXT:    vsrl.vi v1, v8, 8
+; RV32ZVBC32-NEXT:    vand.vx v5, v5, a1
+; RV32ZVBC32-NEXT:    vand.vv v1, v1, v13
+; RV32ZVBC32-NEXT:    vor.vv v5, v1, v5
+; RV32ZVBC32-NEXT:    vand.vx v1, v8, a1
+; RV32ZVBC32-NEXT:    vsll.vi v1, v1, 24
+; RV32ZVBC32-NEXT:    vor.vv v5, v5, v3
+; RV32ZVBC32-NEXT:    vand.vv v3, v8, v13
+; RV32ZVBC32-NEXT:    vsll.vi v3, v3, 8
+; RV32ZVBC32-NEXT:    vor.vv v3, v1, v3
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s9
+; RV32ZVBC32-NEXT:    vor.vv v3, v2, v3
+; RV32ZVBC32-NEXT:    vor.vv v5, v3, v5
+; RV32ZVBC32-NEXT:    vsrl.vi v3, v5, 4
+; RV32ZVBC32-NEXT:    vand.vv v5, v5, v10
+; RV32ZVBC32-NEXT:    vand.vv v3, v3, v10
+; RV32ZVBC32-NEXT:    vsll.vi v5, v5, 4
+; RV32ZVBC32-NEXT:    vor.vv v5, v3, v5
+; RV32ZVBC32-NEXT:    vsrl.vi v3, v5, 2
+; RV32ZVBC32-NEXT:    vand.vv v5, v5, v11
+; RV32ZVBC32-NEXT:    vand.vv v3, v3, v11
+; RV32ZVBC32-NEXT:    vsll.vi v5, v5, 2
+; RV32ZVBC32-NEXT:    vor.vv v5, v3, v5
+; RV32ZVBC32-NEXT:    vsrl.vi v3, v5, 1
+; RV32ZVBC32-NEXT:    vand.vv v5, v5, v12
+; RV32ZVBC32-NEXT:    vand.vv v3, v3, v12
+; RV32ZVBC32-NEXT:    vadd.vv v5, v5, v5
+; RV32ZVBC32-NEXT:    vor.vv v5, v3, v5
+; RV32ZVBC32-NEXT:    vand.vi v3, v9, 2
+; RV32ZVBC32-NEXT:    vand.vi v2, v9, 1
+; RV32ZVBC32-NEXT:    vmul.vv v3, v5, v3
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v2, v3
+; RV32ZVBC32-NEXT:    vand.vi v2, v9, 4
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    vand.vi v2, v9, 8
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, s1
+; RV32ZVBC32-NEXT:    vmul.vv v4, v5, v4
+; RV32ZVBC32-NEXT:    vxor.vv v4, v3, v4
+; RV32ZVBC32-NEXT:    vand.vx v3, v9, s2
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s3
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, s4
+; RV32ZVBC32-NEXT:    vmul.vv v3, v5, v3
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v3
+; RV32ZVBC32-NEXT:    vand.vx v3, v9, s5
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v1, v4, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v1, v2
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s11
+; RV32ZVBC32-NEXT:    vmul.vv v3, v5, v3
+; RV32ZVBC32-NEXT:    vxor.vv v3, v2, v3
+; RV32ZVBC32-NEXT:    lui t6, 1
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, t6
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v1
+; RV32ZVBC32-NEXT:    lui t6, 2
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t6
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    lui t6, 4
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, t6
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t2
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, s0
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t0
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, a7
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a6
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, t3
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t4
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, a5
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a4
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v9, a3
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t1
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v3, v2
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    lui a3, 16384
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, a3
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s7
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s6
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, t5
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v9, s8
+; RV32ZVBC32-NEXT:    addi a3, sp, 112
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v1, v2, v1
+; RV32ZVBC32-NEXT:    vlse64.v v2, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 104
+; RV32ZVBC32-NEXT:    vand.vv v6, v9, v6
+; RV32ZVBC32-NEXT:    vmul.vv v6, v5, v6
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v6
+; RV32ZVBC32-NEXT:    vlse64.v v6, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 96
+; RV32ZVBC32-NEXT:    vand.vv v7, v9, v7
+; RV32ZVBC32-NEXT:    vmul.vv v7, v5, v7
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v7
+; RV32ZVBC32-NEXT:    vlse64.v v7, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 88
+; RV32ZVBC32-NEXT:    vand.vv v31, v9, v31
+; RV32ZVBC32-NEXT:    vmul.vv v31, v5, v31
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v31
+; RV32ZVBC32-NEXT:    vlse64.v v31, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 80
+; RV32ZVBC32-NEXT:    vand.vv v30, v9, v30
+; RV32ZVBC32-NEXT:    vmul.vv v30, v5, v30
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v30
+; RV32ZVBC32-NEXT:    vlse64.v v30, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 72
+; RV32ZVBC32-NEXT:    vand.vv v29, v9, v29
+; RV32ZVBC32-NEXT:    vmul.vv v29, v5, v29
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v29
+; RV32ZVBC32-NEXT:    vlse64.v v29, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 64
+; RV32ZVBC32-NEXT:    vand.vv v28, v9, v28
+; RV32ZVBC32-NEXT:    vmul.vv v28, v5, v28
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v28
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 56
+; RV32ZVBC32-NEXT:    vand.vv v27, v9, v27
+; RV32ZVBC32-NEXT:    vmul.vv v27, v5, v27
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v27
+; RV32ZVBC32-NEXT:    vlse64.v v27, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 48
+; RV32ZVBC32-NEXT:    vand.vv v22, v9, v22
+; RV32ZVBC32-NEXT:    vmul.vv v22, v5, v22
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v22
+; RV32ZVBC32-NEXT:    vlse64.v v22, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 40
+; RV32ZVBC32-NEXT:    vand.vv v19, v9, v19
+; RV32ZVBC32-NEXT:    vmul.vv v19, v5, v19
+; RV32ZVBC32-NEXT:    vxor.vv v19, v1, v19
+; RV32ZVBC32-NEXT:    vlse64.v v1, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 32
+; RV32ZVBC32-NEXT:    vand.vv v18, v9, v18
+; RV32ZVBC32-NEXT:    vand.vv v23, v9, v23
+; RV32ZVBC32-NEXT:    vmul.vv v18, v5, v18
+; RV32ZVBC32-NEXT:    vmul.vv v23, v5, v23
+; RV32ZVBC32-NEXT:    vxor.vv v18, v19, v18
+; RV32ZVBC32-NEXT:    vxor.vv v23, v18, v23
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 24
+; RV32ZVBC32-NEXT:    vand.vv v16, v9, v16
+; RV32ZVBC32-NEXT:    vmul.vv v16, v5, v16
+; RV32ZVBC32-NEXT:    vxor.vv v16, v23, v16
+; RV32ZVBC32-NEXT:    vlse64.v v23, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 16
+; RV32ZVBC32-NEXT:    vand.vv v14, v9, v14
+; RV32ZVBC32-NEXT:    vmul.vv v14, v5, v14
+; RV32ZVBC32-NEXT:    vxor.vv v14, v16, v14
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a3), zero
+; RV32ZVBC32-NEXT:    vand.vv v15, v9, v15
+; RV32ZVBC32-NEXT:    vmul.vv v15, v5, v15
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v15
+; RV32ZVBC32-NEXT:    vand.vv v15, v9, v17
+; RV32ZVBC32-NEXT:    vmul.vv v15, v5, v15
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v15
+; RV32ZVBC32-NEXT:    vand.vv v15, v9, v20
+; RV32ZVBC32-NEXT:    vand.vv v17, v9, v25
+; RV32ZVBC32-NEXT:    vmul.vv v15, v5, v15
+; RV32ZVBC32-NEXT:    vmul.vv v17, v5, v17
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v15
+; RV32ZVBC32-NEXT:    vand.vx v15, v19, a1
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v17
+; RV32ZVBC32-NEXT:    vsrl.vi v17, v14, 24
+; RV32ZVBC32-NEXT:    vand.vx v17, v17, a1
+; RV32ZVBC32-NEXT:    vand.vv v20, v9, v21
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vand.vv v20, v9, v24
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vand.vv v20, v9, v26
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vand.vv v20, v9, v2
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vand.vv v20, v9, v6
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vand.vv v20, v9, v7
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vand.vv v20, v9, v31
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vand.vv v20, v9, v30
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vand.vx v21, v3, a0
+; RV32ZVBC32-NEXT:    vsll.vx v21, v21, a2
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vsrl.vx v20, v14, a2
+; RV32ZVBC32-NEXT:    vand.vx v20, v20, a0
+; RV32ZVBC32-NEXT:    vand.vv v24, v9, v29
+; RV32ZVBC32-NEXT:    vand.vv v25, v9, v28
+; RV32ZVBC32-NEXT:    vand.vv v26, v9, v27
+; RV32ZVBC32-NEXT:    vand.vv v22, v9, v22
+; RV32ZVBC32-NEXT:    vand.vv v27, v9, v1
+; RV32ZVBC32-NEXT:    vand.vv v18, v9, v18
+; RV32ZVBC32-NEXT:    vand.vv v23, v9, v23
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v16
+; RV32ZVBC32-NEXT:    vmul.vv v16, v5, v24
+; RV32ZVBC32-NEXT:    vmul.vv v24, v5, v25
+; RV32ZVBC32-NEXT:    vmul.vv v25, v5, v26
+; RV32ZVBC32-NEXT:    vmul.vv v22, v5, v22
+; RV32ZVBC32-NEXT:    vmul.vv v26, v5, v27
+; RV32ZVBC32-NEXT:    vmul.vv v18, v5, v18
+; RV32ZVBC32-NEXT:    vmul.vv v23, v5, v23
+; RV32ZVBC32-NEXT:    vmul.vv v9, v5, v9
+; RV32ZVBC32-NEXT:    vxor.vv v16, v14, v16
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v25
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v22
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v26
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v23
+; RV32ZVBC32-NEXT:    vxor.vv v9, v16, v9
+; RV32ZVBC32-NEXT:    vsll.vx v16, v4, ra
+; RV32ZVBC32-NEXT:    vsrl.vx v9, v9, ra
+; RV32ZVBC32-NEXT:    vor.vv v16, v16, v21
+; RV32ZVBC32-NEXT:    vsrl.vi v18, v19, 8
+; RV32ZVBC32-NEXT:    vsll.vi v15, v15, 24
+; RV32ZVBC32-NEXT:    vand.vv v18, v18, v13
+; RV32ZVBC32-NEXT:    vor.vv v17, v18, v17
+; RV32ZVBC32-NEXT:    vand.vv v13, v14, v13
+; RV32ZVBC32-NEXT:    vsll.vi v13, v13, 8
+; RV32ZVBC32-NEXT:    vor.vv v13, v15, v13
+; RV32ZVBC32-NEXT:    vor.vv v13, v16, v13
+; RV32ZVBC32-NEXT:    vor.vv v9, v20, v9
+; RV32ZVBC32-NEXT:    vor.vv v9, v17, v9
+; RV32ZVBC32-NEXT:    vor.vv v9, v13, v9
+; RV32ZVBC32-NEXT:    vsrl.vi v13, v9, 4
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vand.vv v10, v13, v10
+; RV32ZVBC32-NEXT:    vsll.vi v9, v9, 4
+; RV32ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vsrl.vi v10, v9, 2
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vsll.vi v9, v9, 2
+; RV32ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vsrl.vi v10, v9, 1
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v12
+; RV32ZVBC32-NEXT:    vadd.vv v9, v9, v9
+; RV32ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v9, 1, v0.t
+; RV32ZVBC32-NEXT:    lw ra, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 296(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 292(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 288(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 284(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    .cfi_restore ra
+; RV32ZVBC32-NEXT:    .cfi_restore s0
+; RV32ZVBC32-NEXT:    .cfi_restore s1
+; RV32ZVBC32-NEXT:    .cfi_restore s2
+; RV32ZVBC32-NEXT:    .cfi_restore s3
+; RV32ZVBC32-NEXT:    .cfi_restore s4
+; RV32ZVBC32-NEXT:    .cfi_restore s5
+; RV32ZVBC32-NEXT:    .cfi_restore s6
+; RV32ZVBC32-NEXT:    .cfi_restore s7
+; RV32ZVBC32-NEXT:    .cfi_restore s8
+; RV32ZVBC32-NEXT:    .cfi_restore s9
+; RV32ZVBC32-NEXT:    .cfi_restore s10
+; RV32ZVBC32-NEXT:    .cfi_restore s11
+; RV32ZVBC32-NEXT:    addi sp, sp, 336
+; RV32ZVBC32-NEXT:    .cfi_def_cfa_offset 0
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv1i64_vv_mask:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    addi sp, sp, -224
+; RV64ZVBC32-NEXT:    .cfi_def_cfa_offset 224
+; RV64ZVBC32-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    .cfi_offset ra, -8
+; RV64ZVBC32-NEXT:    .cfi_offset s0, -16
+; RV64ZVBC32-NEXT:    .cfi_offset s1, -24
+; RV64ZVBC32-NEXT:    .cfi_offset s2, -32
+; RV64ZVBC32-NEXT:    .cfi_offset s3, -40
+; RV64ZVBC32-NEXT:    .cfi_offset s4, -48
+; RV64ZVBC32-NEXT:    .cfi_offset s5, -56
+; RV64ZVBC32-NEXT:    .cfi_offset s6, -64
+; RV64ZVBC32-NEXT:    .cfi_offset s7, -72
+; RV64ZVBC32-NEXT:    .cfi_offset s8, -80
+; RV64ZVBC32-NEXT:    .cfi_offset s9, -88
+; RV64ZVBC32-NEXT:    .cfi_offset s10, -96
+; RV64ZVBC32-NEXT:    .cfi_offset s11, -104
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    sub sp, sp, a0
+; RV64ZVBC32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xe0, 0x01, 0x22, 0x11, 0x2f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 224 + 47 * vlenb
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
+; RV64ZVBC32-NEXT:    vmv1r.v v18, v8
+; RV64ZVBC32-NEXT:    li a3, 56
+; RV64ZVBC32-NEXT:    lui a2, 16
+; RV64ZVBC32-NEXT:    vsrl.vi v16, v8, 24
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 8
+; RV64ZVBC32-NEXT:    li t5, 255
+; RV64ZVBC32-NEXT:    lui a0, 61681
+; RV64ZVBC32-NEXT:    lui a1, 209715
+; RV64ZVBC32-NEXT:    lui a5, 349525
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v9, 24
+; RV64ZVBC32-NEXT:    vsrl.vi v11, v9, 8
+; RV64ZVBC32-NEXT:    li ra, 16
+; RV64ZVBC32-NEXT:    li s11, 32
+; RV64ZVBC32-NEXT:    li s10, 64
+; RV64ZVBC32-NEXT:    li s8, 128
+; RV64ZVBC32-NEXT:    li s9, 256
+; RV64ZVBC32-NEXT:    li s7, 512
+; RV64ZVBC32-NEXT:    li s6, 1024
+; RV64ZVBC32-NEXT:    li t0, 1
+; RV64ZVBC32-NEXT:    lui s5, 1
+; RV64ZVBC32-NEXT:    lui a6, 2
+; RV64ZVBC32-NEXT:    lui a7, 4
+; RV64ZVBC32-NEXT:    lui t1, 8
+; RV64ZVBC32-NEXT:    lui t2, 32
+; RV64ZVBC32-NEXT:    lui t3, 64
+; RV64ZVBC32-NEXT:    lui t4, 128
+; RV64ZVBC32-NEXT:    lui s3, 256
+; RV64ZVBC32-NEXT:    lui s4, 512
+; RV64ZVBC32-NEXT:    addi s0, a0, -241
+; RV64ZVBC32-NEXT:    addi s1, a1, 819
+; RV64ZVBC32-NEXT:    addi s2, a5, 1365
+; RV64ZVBC32-NEXT:    slli a0, s0, 32
+; RV64ZVBC32-NEXT:    add s0, s0, a0
+; RV64ZVBC32-NEXT:    slli a0, s1, 32
+; RV64ZVBC32-NEXT:    add s1, s1, a0
+; RV64ZVBC32-NEXT:    slli a0, s2, 32
+; RV64ZVBC32-NEXT:    add s2, s2, a0
+; RV64ZVBC32-NEXT:    addi t6, a2, -256
+; RV64ZVBC32-NEXT:    slli t5, t5, 24
+; RV64ZVBC32-NEXT:    vsrl.vx v8, v9, a3
+; RV64ZVBC32-NEXT:    li a0, 40
+; RV64ZVBC32-NEXT:    vsrl.vx v13, v9, a0
+; RV64ZVBC32-NEXT:    lui a1, 4080
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a1
+; RV64ZVBC32-NEXT:    vand.vx v14, v9, a1
+; RV64ZVBC32-NEXT:    vsll.vx v15, v9, a3
+; RV64ZVBC32-NEXT:    vand.vx v13, v13, t6
+; RV64ZVBC32-NEXT:    vand.vx v11, v11, t5
+; RV64ZVBC32-NEXT:    vsll.vi v14, v14, 24
+; RV64ZVBC32-NEXT:    vand.vx v17, v9, t5
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, t6
+; RV64ZVBC32-NEXT:    vor.vv v8, v13, v8
+; RV64ZVBC32-NEXT:    vor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    vsll.vi v12, v17, 8
+; RV64ZVBC32-NEXT:    vsll.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    li a4, 40
+; RV64ZVBC32-NEXT:    vor.vv v8, v11, v8
+; RV64ZVBC32-NEXT:    vor.vv v11, v14, v12
+; RV64ZVBC32-NEXT:    vor.vv v9, v15, v9
+; RV64ZVBC32-NEXT:    vor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 4
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, s0
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, s0
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, s1
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, s1
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 1
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, s2
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, s2
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    vor.vv v11, v9, v8
+; RV64ZVBC32-NEXT:    vand.vx v13, v11, ra
+; RV64ZVBC32-NEXT:    lui a0, 4096
+; RV64ZVBC32-NEXT:    vand.vx v14, v11, s11
+; RV64ZVBC32-NEXT:    lui a1, 8192
+; RV64ZVBC32-NEXT:    vand.vx v15, v11, s10
+; RV64ZVBC32-NEXT:    lui a3, 16384
+; RV64ZVBC32-NEXT:    vand.vx v17, v11, s8
+; RV64ZVBC32-NEXT:    lui s8, 32768
+; RV64ZVBC32-NEXT:    vand.vx v19, v11, s9
+; RV64ZVBC32-NEXT:    lui s9, 65536
+; RV64ZVBC32-NEXT:    vand.vx v20, v11, s7
+; RV64ZVBC32-NEXT:    lui s11, 131072
+; RV64ZVBC32-NEXT:    vand.vx v21, v11, s6
+; RV64ZVBC32-NEXT:    slli a5, t0, 11
+; RV64ZVBC32-NEXT:    vand.vx v22, v11, a5
+; RV64ZVBC32-NEXT:    lui ra, 262144
+; RV64ZVBC32-NEXT:    li a5, 56
+; RV64ZVBC32-NEXT:    vsrl.vx v5, v18, a5
+; RV64ZVBC32-NEXT:    vsrl.vx v1, v18, a4
+; RV64ZVBC32-NEXT:    lui s6, 4080
+; RV64ZVBC32-NEXT:    vand.vx v2, v16, s6
+; RV64ZVBC32-NEXT:    vand.vx v8, v18, s6
+; RV64ZVBC32-NEXT:    vsll.vx v4, v18, a5
+; RV64ZVBC32-NEXT:    vand.vx v23, v11, s5
+; RV64ZVBC32-NEXT:    slli s10, t0, 31
+; RV64ZVBC32-NEXT:    vand.vx v24, v11, a6
+; RV64ZVBC32-NEXT:    slli a5, t0, 32
+; RV64ZVBC32-NEXT:    sd a5, 96(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v25, v11, a7
+; RV64ZVBC32-NEXT:    slli a5, t0, 33
+; RV64ZVBC32-NEXT:    sd a5, 88(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v26, v11, t1
+; RV64ZVBC32-NEXT:    slli a5, t0, 34
+; RV64ZVBC32-NEXT:    sd a5, 80(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v27, v11, a2
+; RV64ZVBC32-NEXT:    slli a2, t0, 35
+; RV64ZVBC32-NEXT:    sd a2, 72(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v28, v11, t2
+; RV64ZVBC32-NEXT:    slli a2, t0, 36
+; RV64ZVBC32-NEXT:    sd a2, 64(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v29, v11, t3
+; RV64ZVBC32-NEXT:    slli a2, t0, 37
+; RV64ZVBC32-NEXT:    sd a2, 56(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v30, v11, t4
+; RV64ZVBC32-NEXT:    slli a2, t0, 38
+; RV64ZVBC32-NEXT:    sd a2, 48(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v31, v11, s3
+; RV64ZVBC32-NEXT:    slli a2, t0, 39
+; RV64ZVBC32-NEXT:    sd a2, 40(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v7, v11, s4
+; RV64ZVBC32-NEXT:    slli a2, t0, 40
+; RV64ZVBC32-NEXT:    sd a2, 32(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    lui a2, 1024
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, a2
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    mv a5, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a5
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a2) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli a2, t0, 41
+; RV64ZVBC32-NEXT:    sd a2, 24(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    lui a2, 2048
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, a2
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a5, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a5, a5, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a5, a5, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a5
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a2) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s4, t0, 42
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, a0
+; RV64ZVBC32-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a2, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a2
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s5, t0, 43
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, a1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s6, t0, 44
+; RV64ZVBC32-NEXT:    vand.vx v9, v11, a3
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s7, t0, 45
+; RV64ZVBC32-NEXT:    vand.vx v9, v1, t6
+; RV64ZVBC32-NEXT:    vor.vv v9, v9, v5
+; RV64ZVBC32-NEXT:    vand.vx v12, v11, s8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s8, t0, 46
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, t5
+; RV64ZVBC32-NEXT:    vor.vv v10, v10, v2
+; RV64ZVBC32-NEXT:    vand.vx v12, v11, s9
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s9, t0, 47
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 24
+; RV64ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v10, v18, t5
+; RV64ZVBC32-NEXT:    vsll.vi v10, v10, 8
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v10
+; RV64ZVBC32-NEXT:    vand.vx v10, v18, t6
+; RV64ZVBC32-NEXT:    vsll.vx v10, v10, a4
+; RV64ZVBC32-NEXT:    vor.vv v10, v4, v10
+; RV64ZVBC32-NEXT:    vand.vx v12, v11, s11
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli s11, t0, 48
+; RV64ZVBC32-NEXT:    vor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vand.vx v10, v11, ra
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli ra, t0, 49
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 4
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, s0
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, s0
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, s1
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, s1
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 1
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, s2
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, s2
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    vor.vv v4, v9, v8
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s10
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    slli t4, t0, 50
+; RV64ZVBC32-NEXT:    slli t3, t0, 51
+; RV64ZVBC32-NEXT:    slli t2, t0, 52
+; RV64ZVBC32-NEXT:    slli s10, t0, 53
+; RV64ZVBC32-NEXT:    slli t1, t0, 54
+; RV64ZVBC32-NEXT:    slli a7, t0, 55
+; RV64ZVBC32-NEXT:    slli a6, t0, 56
+; RV64ZVBC32-NEXT:    slli a5, t0, 57
+; RV64ZVBC32-NEXT:    slli a4, t0, 58
+; RV64ZVBC32-NEXT:    slli a2, t0, 59
+; RV64ZVBC32-NEXT:    slli a1, t0, 60
+; RV64ZVBC32-NEXT:    slli a3, t0, 61
+; RV64ZVBC32-NEXT:    slli t0, t0, 62
+; RV64ZVBC32-NEXT:    li a0, -1
+; RV64ZVBC32-NEXT:    slli a0, a0, 63
+; RV64ZVBC32-NEXT:    ld s3, 96(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s3
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    mv s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 4
+; RV64ZVBC32-NEXT:    add s3, s3, s4
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s3, 88(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s3
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    slli s4, s3, 5
+; RV64ZVBC32-NEXT:    add s3, s4, s3
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s3, 80(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s3
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    slli s3, s3, 5
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s3, 72(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s3
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    slli s4, s3, 5
+; RV64ZVBC32-NEXT:    sub s3, s4, s3
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s3, 64(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s3
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    mv s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s4, s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s4, s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s3, s3, s4
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s3, 56(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s3
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    mv s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 2
+; RV64ZVBC32-NEXT:    add s4, s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s4, s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s3, s3, s4
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s3, 48(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s3
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    slli s3, s3, 2
+; RV64ZVBC32-NEXT:    mv s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s4, s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s3, s3, s4
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s3, 40(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s3
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    mv s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s4, s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 2
+; RV64ZVBC32-NEXT:    add s4, s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s3, s3, s4
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s3, 32(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s3
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    mv s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 2
+; RV64ZVBC32-NEXT:    add s4, s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s3, s3, s4
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    ld s3, 24(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s3
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    mv s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 3
+; RV64ZVBC32-NEXT:    add s4, s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s3, s3, s4
+; RV64ZVBC32-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s4
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    slli s3, s3, 3
+; RV64ZVBC32-NEXT:    mv s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s3, s3, s4
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s5
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    mv s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s4, s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s4, s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 2
+; RV64ZVBC32-NEXT:    add s3, s3, s4
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s6
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    mv s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s4, s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 2
+; RV64ZVBC32-NEXT:    add s3, s3, s4
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s7
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    mv s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 2
+; RV64ZVBC32-NEXT:    add s4, s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 2
+; RV64ZVBC32-NEXT:    add s3, s3, s4
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s8
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    slli s3, s3, 2
+; RV64ZVBC32-NEXT:    mv s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 2
+; RV64ZVBC32-NEXT:    add s3, s3, s4
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s9
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    mv s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    add s4, s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 3
+; RV64ZVBC32-NEXT:    add s3, s3, s4
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s11
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    slli s3, s3, 1
+; RV64ZVBC32-NEXT:    mv s4, s3
+; RV64ZVBC32-NEXT:    slli s3, s3, 3
+; RV64ZVBC32-NEXT:    add s3, s3, s4
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, ra
+; RV64ZVBC32-NEXT:    csrr s3, vlenb
+; RV64ZVBC32-NEXT:    slli s4, s3, 4
+; RV64ZVBC32-NEXT:    add s3, s4, s3
+; RV64ZVBC32-NEXT:    add s3, sp, s3
+; RV64ZVBC32-NEXT:    addi s3, s3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, t4
+; RV64ZVBC32-NEXT:    csrr t4, vlenb
+; RV64ZVBC32-NEXT:    slli t4, t4, 4
+; RV64ZVBC32-NEXT:    add t4, sp, t4
+; RV64ZVBC32-NEXT:    addi t4, t4, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (t4) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, t3
+; RV64ZVBC32-NEXT:    csrr t3, vlenb
+; RV64ZVBC32-NEXT:    slli t4, t3, 4
+; RV64ZVBC32-NEXT:    sub t3, t4, t3
+; RV64ZVBC32-NEXT:    add t3, sp, t3
+; RV64ZVBC32-NEXT:    addi t3, t3, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (t3) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, t2
+; RV64ZVBC32-NEXT:    csrr t2, vlenb
+; RV64ZVBC32-NEXT:    slli t2, t2, 1
+; RV64ZVBC32-NEXT:    mv t3, t2
+; RV64ZVBC32-NEXT:    slli t2, t2, 1
+; RV64ZVBC32-NEXT:    add t3, t3, t2
+; RV64ZVBC32-NEXT:    slli t2, t2, 1
+; RV64ZVBC32-NEXT:    add t2, t2, t3
+; RV64ZVBC32-NEXT:    add t2, sp, t2
+; RV64ZVBC32-NEXT:    addi t2, t2, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (t2) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, s10
+; RV64ZVBC32-NEXT:    csrr t2, vlenb
+; RV64ZVBC32-NEXT:    mv t3, t2
+; RV64ZVBC32-NEXT:    slli t2, t2, 2
+; RV64ZVBC32-NEXT:    add t3, t3, t2
+; RV64ZVBC32-NEXT:    slli t2, t2, 1
+; RV64ZVBC32-NEXT:    add t2, t2, t3
+; RV64ZVBC32-NEXT:    add t2, sp, t2
+; RV64ZVBC32-NEXT:    addi t2, t2, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (t2) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, t1
+; RV64ZVBC32-NEXT:    csrr t1, vlenb
+; RV64ZVBC32-NEXT:    slli t1, t1, 2
+; RV64ZVBC32-NEXT:    mv t2, t1
+; RV64ZVBC32-NEXT:    slli t1, t1, 1
+; RV64ZVBC32-NEXT:    add t1, t1, t2
+; RV64ZVBC32-NEXT:    add t1, sp, t1
+; RV64ZVBC32-NEXT:    addi t1, t1, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (t1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, a7
+; RV64ZVBC32-NEXT:    csrr a7, vlenb
+; RV64ZVBC32-NEXT:    mv t1, a7
+; RV64ZVBC32-NEXT:    slli a7, a7, 1
+; RV64ZVBC32-NEXT:    add t1, t1, a7
+; RV64ZVBC32-NEXT:    slli a7, a7, 2
+; RV64ZVBC32-NEXT:    add a7, a7, t1
+; RV64ZVBC32-NEXT:    add a7, sp, a7
+; RV64ZVBC32-NEXT:    addi a7, a7, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (a7) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, a6
+; RV64ZVBC32-NEXT:    csrr a6, vlenb
+; RV64ZVBC32-NEXT:    slli a6, a6, 1
+; RV64ZVBC32-NEXT:    mv a7, a6
+; RV64ZVBC32-NEXT:    slli a6, a6, 2
+; RV64ZVBC32-NEXT:    add a6, a6, a7
+; RV64ZVBC32-NEXT:    add a6, sp, a6
+; RV64ZVBC32-NEXT:    addi a6, a6, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (a6) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, a5
+; RV64ZVBC32-NEXT:    csrr a5, vlenb
+; RV64ZVBC32-NEXT:    slli a6, a5, 3
+; RV64ZVBC32-NEXT:    add a5, a6, a5
+; RV64ZVBC32-NEXT:    add a5, sp, a5
+; RV64ZVBC32-NEXT:    addi a5, a5, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (a5) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, a4
+; RV64ZVBC32-NEXT:    csrr a4, vlenb
+; RV64ZVBC32-NEXT:    slli a4, a4, 3
+; RV64ZVBC32-NEXT:    add a4, sp, a4
+; RV64ZVBC32-NEXT:    addi a4, a4, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (a4) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, a2
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a4, a2, 3
+; RV64ZVBC32-NEXT:    sub a2, a4, a2
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (a2) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vx v8, v11, a1
+; RV64ZVBC32-NEXT:    csrr a1, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    mv a2, a1
+; RV64ZVBC32-NEXT:    slli a1, a1, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a2
+; RV64ZVBC32-NEXT:    add a1, sp, a1
+; RV64ZVBC32-NEXT:    addi a1, a1, 112
+; RV64ZVBC32-NEXT:    vs1r.v v8, (a1) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vand.vi v8, v11, 2
+; RV64ZVBC32-NEXT:    vand.vi v9, v11, 1
+; RV64ZVBC32-NEXT:    vand.vi v10, v11, 4
+; RV64ZVBC32-NEXT:    vand.vi v12, v11, 8
+; RV64ZVBC32-NEXT:    vand.vx v6, v11, a3
+; RV64ZVBC32-NEXT:    vand.vx v5, v11, t0
+; RV64ZVBC32-NEXT:    vand.vx v2, v11, a0
+; RV64ZVBC32-NEXT:    vmul.vv v3, v4, v8
+; RV64ZVBC32-NEXT:    vmul.vv v8, v4, v9
+; RV64ZVBC32-NEXT:    vmul.vv v9, v4, v10
+; RV64ZVBC32-NEXT:    vmul.vv v10, v4, v12
+; RV64ZVBC32-NEXT:    vmul.vv v11, v4, v13
+; RV64ZVBC32-NEXT:    vmul.vv v12, v4, v14
+; RV64ZVBC32-NEXT:    vmul.vv v13, v4, v15
+; RV64ZVBC32-NEXT:    vmul.vv v14, v4, v17
+; RV64ZVBC32-NEXT:    vmul.vv v15, v4, v19
+; RV64ZVBC32-NEXT:    vmul.vv v16, v4, v20
+; RV64ZVBC32-NEXT:    vmul.vv v17, v4, v21
+; RV64ZVBC32-NEXT:    vmul.vv v18, v4, v22
+; RV64ZVBC32-NEXT:    vmul.vv v19, v4, v23
+; RV64ZVBC32-NEXT:    vmul.vv v20, v4, v24
+; RV64ZVBC32-NEXT:    vmul.vv v21, v4, v25
+; RV64ZVBC32-NEXT:    vmul.vv v22, v4, v26
+; RV64ZVBC32-NEXT:    vmul.vv v23, v4, v27
+; RV64ZVBC32-NEXT:    vmul.vv v24, v4, v28
+; RV64ZVBC32-NEXT:    vmul.vv v25, v4, v29
+; RV64ZVBC32-NEXT:    vmul.vv v26, v4, v30
+; RV64ZVBC32-NEXT:    vmul.vv v27, v4, v31
+; RV64ZVBC32-NEXT:    vmul.vv v28, v4, v7
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v29, v4, v29
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v30, v4, v30
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v31, v4, v31
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v7, v4, v7
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v0
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v0
+; RV64ZVBC32-NEXT:    addi a0, sp, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v0, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 5
+; RV64ZVBC32-NEXT:    add a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 5
+; RV64ZVBC32-NEXT:    sub a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 5
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 5
+; RV64ZVBC32-NEXT:    sub a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 4
+; RV64ZVBC32-NEXT:    sub a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 4
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 3
+; RV64ZVBC32-NEXT:    add a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 3
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a1, a0, 3
+; RV64ZVBC32-NEXT:    sub a0, a1, a0
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vmul.vv v1, v4, v1
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64ZVBC32-NEXT:    vmul.vv v6, v4, v6
+; RV64ZVBC32-NEXT:    vmul.vv v5, v4, v5
+; RV64ZVBC32-NEXT:    vmul.vv v4, v4, v2
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v3
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v10
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v11
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v12
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v13
+; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
+; RV64ZVBC32-NEXT:    vxor.vv v9, v8, v15
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v16
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v17
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v18
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v19
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v20
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v21
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v22
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v23
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v24
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v25
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v26
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v27
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v28
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v29
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v30
+; RV64ZVBC32-NEXT:    vxor.vv v10, v9, v31
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v7
+; RV64ZVBC32-NEXT:    li a1, 56
+; RV64ZVBC32-NEXT:    vsll.vx v8, v8, a1
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, t6
+; RV64ZVBC32-NEXT:    li a0, 40
+; RV64ZVBC32-NEXT:    vsll.vx v9, v9, a0
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a3, a2
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v9, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a3, a2
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    addi a2, sp, 112
+; RV64ZVBC32-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v0
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 4
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a2, 5
+; RV64ZVBC32-NEXT:    add a2, a3, a2
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 5
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a2, 5
+; RV64ZVBC32-NEXT:    sub a2, a3, a2
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v9, v10
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 3
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 3
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v10, v11
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 3
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a2, 4
+; RV64ZVBC32-NEXT:    add a2, a3, a2
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v9, 8
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, t5
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v10, 24
+; RV64ZVBC32-NEXT:    lui a2, 4080
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a2
+; RV64ZVBC32-NEXT:    vor.vv v10, v12, v10
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 112
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a3) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 112
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a3) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 112
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a3) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    slli a3, a3, 2
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 112
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a3) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    csrr a3, vlenb
+; RV64ZVBC32-NEXT:    mv a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 1
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, a3, 4
+; RV64ZVBC32-NEXT:    add a3, a3, a4
+; RV64ZVBC32-NEXT:    add a3, sp, a3
+; RV64ZVBC32-NEXT:    addi a3, a3, 112
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a3) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a2
+; RV64ZVBC32-NEXT:    vsll.vi v9, v9, 24
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 3
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v11, v12
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 3
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    vand.vx v13, v11, t5
+; RV64ZVBC32-NEXT:    vsll.vi v13, v13, 8
+; RV64ZVBC32-NEXT:    vor.vv v9, v9, v13
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    csrr a2, vlenb
+; RV64ZVBC32-NEXT:    mv a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 1
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a3, a3, a2
+; RV64ZVBC32-NEXT:    slli a2, a2, 2
+; RV64ZVBC32-NEXT:    add a2, a2, a3
+; RV64ZVBC32-NEXT:    add a2, sp, a2
+; RV64ZVBC32-NEXT:    addi a2, a2, 112
+; RV64ZVBC32-NEXT:    vl1r.v v9, (a2) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v9, v12, v9
+; RV64ZVBC32-NEXT:    vsrl.vx v11, v11, a0
+; RV64ZVBC32-NEXT:    vand.vx v11, v11, t6
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    mv a2, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a2, a2, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a2
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v12
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v6
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v5
+; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v4
+; RV64ZVBC32-NEXT:    vsrl.vx v9, v9, a1
+; RV64ZVBC32-NEXT:    vor.vv v9, v11, v9
+; RV64ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vor.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 4
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, s0
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, s0
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 2
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, s1
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, s1
+; RV64ZVBC32-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC32-NEXT:    vor.vv v8, v9, v8
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 1
+; RV64ZVBC32-NEXT:    vand.vx v8, v8, s2
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, s2
+; RV64ZVBC32-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC32-NEXT:    vor.vv v9, v9, v8
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add a0, sp, a0
+; RV64ZVBC32-NEXT:    addi a0, a0, 112
+; RV64ZVBC32-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    vsrl.vi v8, v9, 1, v0.t
+; RV64ZVBC32-NEXT:    csrr a0, vlenb
+; RV64ZVBC32-NEXT:    mv a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 1
+; RV64ZVBC32-NEXT:    add a1, a1, a0
+; RV64ZVBC32-NEXT:    slli a0, a0, 2
+; RV64ZVBC32-NEXT:    add a0, a0, a1
+; RV64ZVBC32-NEXT:    add sp, sp, a0
+; RV64ZVBC32-NEXT:    .cfi_def_cfa sp, 224
+; RV64ZVBC32-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
+; RV64ZVBC32-NEXT:    .cfi_restore ra
+; RV64ZVBC32-NEXT:    .cfi_restore s0
+; RV64ZVBC32-NEXT:    .cfi_restore s1
+; RV64ZVBC32-NEXT:    .cfi_restore s2
+; RV64ZVBC32-NEXT:    .cfi_restore s3
+; RV64ZVBC32-NEXT:    .cfi_restore s4
+; RV64ZVBC32-NEXT:    .cfi_restore s5
+; RV64ZVBC32-NEXT:    .cfi_restore s6
+; RV64ZVBC32-NEXT:    .cfi_restore s7
+; RV64ZVBC32-NEXT:    .cfi_restore s8
+; RV64ZVBC32-NEXT:    .cfi_restore s9
+; RV64ZVBC32-NEXT:    .cfi_restore s10
+; RV64ZVBC32-NEXT:    .cfi_restore s11
+; RV64ZVBC32-NEXT:    addi sp, sp, 224
+; RV64ZVBC32-NEXT:    .cfi_def_cfa_offset 0
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 1 x i64> %va to <vscale x 1 x i128>
   %vb.ext = zext <vscale x 1 x i64> %vb to <vscale x 1 x i128>
   %clmul = call <vscale x 1 x i128> @llvm.clmul.nxv1i128(<vscale x 1 x i128> %va.ext, <vscale x 1 x i128> %vb.ext)
@@ -46773,25 +85131,971 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vx_mask(<vscale x 1 x i64> %va, i64 %b
 ; RV64V-NEXT:    vsrl.vi v8, v9, 1, v0.t
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv1i64_vx_mask:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    addi sp, sp, -16
-; RV32ZVBC-NEXT:    .cfi_def_cfa_offset 16
-; RV32ZVBC-NEXT:    sw a0, 8(sp)
-; RV32ZVBC-NEXT:    sw a1, 12(sp)
-; RV32ZVBC-NEXT:    addi a0, sp, 8
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
-; RV32ZVBC-NEXT:    vlse64.v v9, (a0), zero
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v9, v0.t
-; RV32ZVBC-NEXT:    addi sp, sp, 16
-; RV32ZVBC-NEXT:    .cfi_def_cfa_offset 0
-; RV32ZVBC-NEXT:    ret
+; RV32ZVBC64-LABEL: clmulh_nxv1i64_vx_mask:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    addi sp, sp, -16
+; RV32ZVBC64-NEXT:    .cfi_def_cfa_offset 16
+; RV32ZVBC64-NEXT:    sw a0, 8(sp)
+; RV32ZVBC64-NEXT:    sw a1, 12(sp)
+; RV32ZVBC64-NEXT:    addi a0, sp, 8
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
+; RV32ZVBC64-NEXT:    vlse64.v v9, (a0), zero
+; RV32ZVBC64-NEXT:    vclmulh.vv v8, v8, v9, v0.t
+; RV32ZVBC64-NEXT:    addi sp, sp, 16
+; RV32ZVBC64-NEXT:    .cfi_def_cfa_offset 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv1i64_vx_mask:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
+; RV64ZVBC64-NEXT:    vclmulh.vx v8, v8, a0, v0.t
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv1i64_vx_mask:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    addi sp, sp, -336
+; RV32ZVBC32-NEXT:    .cfi_def_cfa_offset 336
+; RV32ZVBC32-NEXT:    sw ra, 332(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s0, 328(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s1, 324(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s2, 320(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s3, 316(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s4, 312(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s5, 308(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s6, 304(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s7, 300(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s8, 296(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s9, 292(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s10, 288(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    sw s11, 284(sp) # 4-byte Folded Spill
+; RV32ZVBC32-NEXT:    .cfi_offset ra, -4
+; RV32ZVBC32-NEXT:    .cfi_offset s0, -8
+; RV32ZVBC32-NEXT:    .cfi_offset s1, -12
+; RV32ZVBC32-NEXT:    .cfi_offset s2, -16
+; RV32ZVBC32-NEXT:    .cfi_offset s3, -20
+; RV32ZVBC32-NEXT:    .cfi_offset s4, -24
+; RV32ZVBC32-NEXT:    .cfi_offset s5, -28
+; RV32ZVBC32-NEXT:    .cfi_offset s6, -32
+; RV32ZVBC32-NEXT:    .cfi_offset s7, -36
+; RV32ZVBC32-NEXT:    .cfi_offset s8, -40
+; RV32ZVBC32-NEXT:    .cfi_offset s9, -44
+; RV32ZVBC32-NEXT:    .cfi_offset s10, -48
+; RV32ZVBC32-NEXT:    .cfi_offset s11, -52
+; RV32ZVBC32-NEXT:    sw a0, 0(sp)
+; RV32ZVBC32-NEXT:    sw a1, 4(sp)
+; RV32ZVBC32-NEXT:    mv t6, sp
+; RV32ZVBC32-NEXT:    lui s0, 1044480
+; RV32ZVBC32-NEXT:    li s11, 1
+; RV32ZVBC32-NEXT:    li s6, 2
+; RV32ZVBC32-NEXT:    li s3, 4
+; RV32ZVBC32-NEXT:    li s8, 8
+; RV32ZVBC32-NEXT:    li s10, 32
+; RV32ZVBC32-NEXT:    li s9, 64
+; RV32ZVBC32-NEXT:    li s5, 128
+; RV32ZVBC32-NEXT:    li s4, 256
+; RV32ZVBC32-NEXT:    li s2, 512
+; RV32ZVBC32-NEXT:    li s1, 1024
+; RV32ZVBC32-NEXT:    lui ra, 1
+; RV32ZVBC32-NEXT:    lui t5, 2
+; RV32ZVBC32-NEXT:    lui t4, 4
+; RV32ZVBC32-NEXT:    lui t3, 8
+; RV32ZVBC32-NEXT:    lui t2, 16
+; RV32ZVBC32-NEXT:    lui t1, 32
+; RV32ZVBC32-NEXT:    lui t0, 64
+; RV32ZVBC32-NEXT:    lui a7, 128
+; RV32ZVBC32-NEXT:    lui a6, 256
+; RV32ZVBC32-NEXT:    lui a5, 512
+; RV32ZVBC32-NEXT:    lui a4, 1024
+; RV32ZVBC32-NEXT:    lui a3, 2048
+; RV32ZVBC32-NEXT:    lui a2, 4096
+; RV32ZVBC32-NEXT:    lui a1, 8192
+; RV32ZVBC32-NEXT:    vsetvli s7, zero, e64, m1, ta, ma
+; RV32ZVBC32-NEXT:    vlse64.v v13, (t6), zero
+; RV32ZVBC32-NEXT:    lui t6, 16384
+; RV32ZVBC32-NEXT:    sw s0, 248(sp)
+; RV32ZVBC32-NEXT:    lui s0, 32768
+; RV32ZVBC32-NEXT:    sw zero, 252(sp)
+; RV32ZVBC32-NEXT:    lui a0, 524288
+; RV32ZVBC32-NEXT:    sw a0, 8(sp)
+; RV32ZVBC32-NEXT:    sw zero, 12(sp)
+; RV32ZVBC32-NEXT:    sw zero, 272(sp)
+; RV32ZVBC32-NEXT:    sw s11, 276(sp)
+; RV32ZVBC32-NEXT:    sw zero, 256(sp)
+; RV32ZVBC32-NEXT:    sw s6, 260(sp)
+; RV32ZVBC32-NEXT:    lui s6, 65536
+; RV32ZVBC32-NEXT:    sw zero, 264(sp)
+; RV32ZVBC32-NEXT:    sw s3, 268(sp)
+; RV32ZVBC32-NEXT:    lui s7, 131072
+; RV32ZVBC32-NEXT:    sw zero, 240(sp)
+; RV32ZVBC32-NEXT:    sw s8, 244(sp)
+; RV32ZVBC32-NEXT:    lui s8, 262144
+; RV32ZVBC32-NEXT:    sw zero, 232(sp)
+; RV32ZVBC32-NEXT:    li s3, 16
+; RV32ZVBC32-NEXT:    sw s3, 236(sp)
+; RV32ZVBC32-NEXT:    li s3, 16
+; RV32ZVBC32-NEXT:    sw zero, 224(sp)
+; RV32ZVBC32-NEXT:    sw s10, 228(sp)
+; RV32ZVBC32-NEXT:    sw zero, 216(sp)
+; RV32ZVBC32-NEXT:    sw s9, 220(sp)
+; RV32ZVBC32-NEXT:    li s10, 64
+; RV32ZVBC32-NEXT:    sw zero, 208(sp)
+; RV32ZVBC32-NEXT:    sw s5, 212(sp)
+; RV32ZVBC32-NEXT:    sw zero, 200(sp)
+; RV32ZVBC32-NEXT:    sw s4, 204(sp)
+; RV32ZVBC32-NEXT:    sw zero, 192(sp)
+; RV32ZVBC32-NEXT:    sw s2, 196(sp)
+; RV32ZVBC32-NEXT:    li s4, 512
+; RV32ZVBC32-NEXT:    sw zero, 184(sp)
+; RV32ZVBC32-NEXT:    sw s1, 188(sp)
+; RV32ZVBC32-NEXT:    li s2, 1024
+; RV32ZVBC32-NEXT:    slli s11, s11, 11
+; RV32ZVBC32-NEXT:    sw zero, 176(sp)
+; RV32ZVBC32-NEXT:    sw s11, 180(sp)
+; RV32ZVBC32-NEXT:    sw zero, 168(sp)
+; RV32ZVBC32-NEXT:    sw ra, 172(sp)
+; RV32ZVBC32-NEXT:    sw zero, 160(sp)
+; RV32ZVBC32-NEXT:    sw t5, 164(sp)
+; RV32ZVBC32-NEXT:    lui s5, 2
+; RV32ZVBC32-NEXT:    sw zero, 152(sp)
+; RV32ZVBC32-NEXT:    sw t4, 156(sp)
+; RV32ZVBC32-NEXT:    lui s1, 4
+; RV32ZVBC32-NEXT:    sw zero, 144(sp)
+; RV32ZVBC32-NEXT:    sw t3, 148(sp)
+; RV32ZVBC32-NEXT:    lui t4, 8
+; RV32ZVBC32-NEXT:    sw zero, 136(sp)
+; RV32ZVBC32-NEXT:    sw t2, 140(sp)
+; RV32ZVBC32-NEXT:    lui t5, 16
+; RV32ZVBC32-NEXT:    sw zero, 128(sp)
+; RV32ZVBC32-NEXT:    sw t1, 132(sp)
+; RV32ZVBC32-NEXT:    lui t3, 32
+; RV32ZVBC32-NEXT:    sw zero, 120(sp)
+; RV32ZVBC32-NEXT:    sw t0, 124(sp)
+; RV32ZVBC32-NEXT:    lui t1, 64
+; RV32ZVBC32-NEXT:    sw zero, 112(sp)
+; RV32ZVBC32-NEXT:    sw a7, 116(sp)
+; RV32ZVBC32-NEXT:    lui t2, 128
+; RV32ZVBC32-NEXT:    sw zero, 104(sp)
+; RV32ZVBC32-NEXT:    sw a6, 108(sp)
+; RV32ZVBC32-NEXT:    sw zero, 96(sp)
+; RV32ZVBC32-NEXT:    sw a5, 100(sp)
+; RV32ZVBC32-NEXT:    lui t0, 512
+; RV32ZVBC32-NEXT:    sw zero, 88(sp)
+; RV32ZVBC32-NEXT:    sw a4, 92(sp)
+; RV32ZVBC32-NEXT:    lui a7, 1024
+; RV32ZVBC32-NEXT:    sw zero, 80(sp)
+; RV32ZVBC32-NEXT:    sw a3, 84(sp)
+; RV32ZVBC32-NEXT:    lui a4, 2048
+; RV32ZVBC32-NEXT:    sw zero, 72(sp)
+; RV32ZVBC32-NEXT:    sw a2, 76(sp)
+; RV32ZVBC32-NEXT:    lui a5, 4096
+; RV32ZVBC32-NEXT:    sw zero, 64(sp)
+; RV32ZVBC32-NEXT:    sw a1, 68(sp)
+; RV32ZVBC32-NEXT:    lui a3, 8192
+; RV32ZVBC32-NEXT:    sw zero, 56(sp)
+; RV32ZVBC32-NEXT:    sw t6, 60(sp)
+; RV32ZVBC32-NEXT:    sw zero, 48(sp)
+; RV32ZVBC32-NEXT:    sw s0, 52(sp)
+; RV32ZVBC32-NEXT:    sw zero, 40(sp)
+; RV32ZVBC32-NEXT:    sw s6, 44(sp)
+; RV32ZVBC32-NEXT:    sw zero, 32(sp)
+; RV32ZVBC32-NEXT:    sw s7, 36(sp)
+; RV32ZVBC32-NEXT:    sw zero, 24(sp)
+; RV32ZVBC32-NEXT:    sw s8, 28(sp)
+; RV32ZVBC32-NEXT:    sw zero, 16(sp)
+; RV32ZVBC32-NEXT:    sw a0, 20(sp)
+; RV32ZVBC32-NEXT:    lui a0, 61681
+; RV32ZVBC32-NEXT:    addi a0, a0, -241
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC32-NEXT:    lui a0, 209715
+; RV32ZVBC32-NEXT:    addi a0, a0, 819
+; RV32ZVBC32-NEXT:    vmv.v.x v10, a0
+; RV32ZVBC32-NEXT:    lui a0, 349525
+; RV32ZVBC32-NEXT:    addi a0, a0, 1365
+; RV32ZVBC32-NEXT:    vmv.v.x v11, a0
+; RV32ZVBC32-NEXT:    addi a0, sp, 248
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
+; RV32ZVBC32-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 8
+; RV32ZVBC32-NEXT:    vlse64.v v6, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 272
+; RV32ZVBC32-NEXT:    vlse64.v v7, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 256
+; RV32ZVBC32-NEXT:    vlse64.v v31, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 264
+; RV32ZVBC32-NEXT:    vlse64.v v30, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 240
+; RV32ZVBC32-NEXT:    vlse64.v v29, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 232
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 224
+; RV32ZVBC32-NEXT:    vlse64.v v27, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 216
+; RV32ZVBC32-NEXT:    vlse64.v v22, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 208
+; RV32ZVBC32-NEXT:    vlse64.v v19, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 200
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 192
+; RV32ZVBC32-NEXT:    vlse64.v v23, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 184
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 176
+; RV32ZVBC32-NEXT:    vlse64.v v14, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 168
+; RV32ZVBC32-NEXT:    vlse64.v v15, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 160
+; RV32ZVBC32-NEXT:    vlse64.v v17, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 152
+; RV32ZVBC32-NEXT:    vlse64.v v20, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 144
+; RV32ZVBC32-NEXT:    vlse64.v v25, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 136
+; RV32ZVBC32-NEXT:    vlse64.v v21, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 128
+; RV32ZVBC32-NEXT:    vlse64.v v24, (a0), zero
+; RV32ZVBC32-NEXT:    addi a0, sp, 120
+; RV32ZVBC32-NEXT:    vlse64.v v26, (a0), zero
+; RV32ZVBC32-NEXT:    li ra, 56
+; RV32ZVBC32-NEXT:    vsrl.vx v5, v13, ra
+; RV32ZVBC32-NEXT:    li a2, 40
+; RV32ZVBC32-NEXT:    vsrl.vx v4, v13, a2
+; RV32ZVBC32-NEXT:    vsrl.vi v3, v13, 24
+; RV32ZVBC32-NEXT:    vsrl.vi v2, v13, 8
+; RV32ZVBC32-NEXT:    vsll.vx v1, v13, ra
+; RV32ZVBC32-NEXT:    addi a0, t5, -256
+; RV32ZVBC32-NEXT:    vand.vx v4, v4, a0
+; RV32ZVBC32-NEXT:    vor.vv v5, v4, v5
+; RV32ZVBC32-NEXT:    vand.vx v4, v13, a0
+; RV32ZVBC32-NEXT:    vsll.vx v4, v4, a2
+; RV32ZVBC32-NEXT:    vor.vv v4, v1, v4
+; RV32ZVBC32-NEXT:    lui a1, 4080
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, a1
+; RV32ZVBC32-NEXT:    vand.vx v3, v3, a1
+; RV32ZVBC32-NEXT:    vsll.vi v1, v1, 24
+; RV32ZVBC32-NEXT:    vand.vv v2, v2, v12
+; RV32ZVBC32-NEXT:    vand.vv v13, v13, v12
+; RV32ZVBC32-NEXT:    vor.vv v3, v2, v3
+; RV32ZVBC32-NEXT:    vsll.vi v13, v13, 8
+; RV32ZVBC32-NEXT:    vor.vv v5, v3, v5
+; RV32ZVBC32-NEXT:    vor.vv v13, v1, v13
+; RV32ZVBC32-NEXT:    vor.vv v13, v4, v13
+; RV32ZVBC32-NEXT:    vor.vv v13, v13, v5
+; RV32ZVBC32-NEXT:    vsrl.vi v5, v13, 4
+; RV32ZVBC32-NEXT:    vand.vv v13, v13, v9
+; RV32ZVBC32-NEXT:    vand.vv v5, v5, v9
+; RV32ZVBC32-NEXT:    vsll.vi v13, v13, 4
+; RV32ZVBC32-NEXT:    vor.vv v13, v5, v13
+; RV32ZVBC32-NEXT:    vsrl.vi v5, v13, 2
+; RV32ZVBC32-NEXT:    vand.vv v13, v13, v10
+; RV32ZVBC32-NEXT:    vand.vv v5, v5, v10
+; RV32ZVBC32-NEXT:    vsll.vi v13, v13, 2
+; RV32ZVBC32-NEXT:    vor.vv v13, v5, v13
+; RV32ZVBC32-NEXT:    vsrl.vi v5, v13, 1
+; RV32ZVBC32-NEXT:    vand.vv v13, v13, v11
+; RV32ZVBC32-NEXT:    vand.vv v5, v5, v11
+; RV32ZVBC32-NEXT:    vadd.vv v13, v13, v13
+; RV32ZVBC32-NEXT:    vor.vv v13, v5, v13
+; RV32ZVBC32-NEXT:    vand.vx v4, v13, s3
+; RV32ZVBC32-NEXT:    vsrl.vi v5, v8, 24
+; RV32ZVBC32-NEXT:    vsrl.vx v3, v8, ra
+; RV32ZVBC32-NEXT:    vsrl.vx v2, v8, a2
+; RV32ZVBC32-NEXT:    vsll.vx v1, v8, ra
+; RV32ZVBC32-NEXT:    vand.vx v2, v2, a0
+; RV32ZVBC32-NEXT:    vor.vv v3, v2, v3
+; RV32ZVBC32-NEXT:    vand.vx v2, v8, a0
+; RV32ZVBC32-NEXT:    vsll.vx v2, v2, a2
+; RV32ZVBC32-NEXT:    vor.vv v2, v1, v2
+; RV32ZVBC32-NEXT:    vsrl.vi v1, v8, 8
+; RV32ZVBC32-NEXT:    vand.vx v5, v5, a1
+; RV32ZVBC32-NEXT:    vand.vv v1, v1, v12
+; RV32ZVBC32-NEXT:    vor.vv v5, v1, v5
+; RV32ZVBC32-NEXT:    vand.vx v1, v8, a1
+; RV32ZVBC32-NEXT:    vsll.vi v1, v1, 24
+; RV32ZVBC32-NEXT:    vor.vv v5, v5, v3
+; RV32ZVBC32-NEXT:    vand.vv v3, v8, v12
+; RV32ZVBC32-NEXT:    vsll.vi v3, v3, 8
+; RV32ZVBC32-NEXT:    vor.vv v3, v1, v3
+; RV32ZVBC32-NEXT:    li s9, 32
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, s9
+; RV32ZVBC32-NEXT:    vor.vv v3, v2, v3
+; RV32ZVBC32-NEXT:    vor.vv v5, v3, v5
+; RV32ZVBC32-NEXT:    vsrl.vi v3, v5, 4
+; RV32ZVBC32-NEXT:    vand.vv v5, v5, v9
+; RV32ZVBC32-NEXT:    vand.vv v3, v3, v9
+; RV32ZVBC32-NEXT:    vsll.vi v5, v5, 4
+; RV32ZVBC32-NEXT:    vor.vv v5, v3, v5
+; RV32ZVBC32-NEXT:    vsrl.vi v3, v5, 2
+; RV32ZVBC32-NEXT:    vand.vv v5, v5, v10
+; RV32ZVBC32-NEXT:    vand.vv v3, v3, v10
+; RV32ZVBC32-NEXT:    vsll.vi v5, v5, 2
+; RV32ZVBC32-NEXT:    vor.vv v5, v3, v5
+; RV32ZVBC32-NEXT:    vsrl.vi v3, v5, 1
+; RV32ZVBC32-NEXT:    vand.vv v5, v5, v11
+; RV32ZVBC32-NEXT:    vand.vv v3, v3, v11
+; RV32ZVBC32-NEXT:    vadd.vv v5, v5, v5
+; RV32ZVBC32-NEXT:    vor.vv v5, v3, v5
+; RV32ZVBC32-NEXT:    vand.vi v3, v13, 2
+; RV32ZVBC32-NEXT:    vand.vi v2, v13, 1
+; RV32ZVBC32-NEXT:    vmul.vv v3, v5, v3
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v2, v3
+; RV32ZVBC32-NEXT:    vand.vi v2, v13, 4
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    vand.vi v2, v13, 8
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, s10
+; RV32ZVBC32-NEXT:    vmul.vv v4, v5, v4
+; RV32ZVBC32-NEXT:    vxor.vv v4, v3, v4
+; RV32ZVBC32-NEXT:    li s3, 128
+; RV32ZVBC32-NEXT:    vand.vx v3, v13, s3
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v1
+; RV32ZVBC32-NEXT:    li s3, 256
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, s3
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, s4
+; RV32ZVBC32-NEXT:    vmul.vv v3, v5, v3
+; RV32ZVBC32-NEXT:    vxor.vv v4, v4, v3
+; RV32ZVBC32-NEXT:    vand.vx v3, v13, s2
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v1, v4, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v1, v2
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, s11
+; RV32ZVBC32-NEXT:    vmul.vv v3, v5, v3
+; RV32ZVBC32-NEXT:    vxor.vv v3, v2, v3
+; RV32ZVBC32-NEXT:    lui s2, 1
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, s2
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, s5
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, s1
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, t4
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, t5
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, t3
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, t1
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, t2
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, a6
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, t0
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, a7
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, a4
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v2
+; RV32ZVBC32-NEXT:    vand.vx v2, v13, a5
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v3, v3, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, a3
+; RV32ZVBC32-NEXT:    vmul.vv v2, v5, v2
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v3, v2
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, t6
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, s0
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, s6
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, s7
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v2, v2, v1
+; RV32ZVBC32-NEXT:    vand.vx v1, v13, s8
+; RV32ZVBC32-NEXT:    addi a3, sp, 112
+; RV32ZVBC32-NEXT:    vmul.vv v1, v5, v1
+; RV32ZVBC32-NEXT:    vxor.vv v1, v2, v1
+; RV32ZVBC32-NEXT:    vlse64.v v2, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 104
+; RV32ZVBC32-NEXT:    vand.vv v6, v13, v6
+; RV32ZVBC32-NEXT:    vmul.vv v6, v5, v6
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v6
+; RV32ZVBC32-NEXT:    vlse64.v v6, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 96
+; RV32ZVBC32-NEXT:    vand.vv v7, v13, v7
+; RV32ZVBC32-NEXT:    vmul.vv v7, v5, v7
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v7
+; RV32ZVBC32-NEXT:    vlse64.v v7, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 88
+; RV32ZVBC32-NEXT:    vand.vv v31, v13, v31
+; RV32ZVBC32-NEXT:    vmul.vv v31, v5, v31
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v31
+; RV32ZVBC32-NEXT:    vlse64.v v31, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 80
+; RV32ZVBC32-NEXT:    vand.vv v30, v13, v30
+; RV32ZVBC32-NEXT:    vmul.vv v30, v5, v30
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v30
+; RV32ZVBC32-NEXT:    vlse64.v v30, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 72
+; RV32ZVBC32-NEXT:    vand.vv v29, v13, v29
+; RV32ZVBC32-NEXT:    vmul.vv v29, v5, v29
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v29
+; RV32ZVBC32-NEXT:    vlse64.v v29, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 64
+; RV32ZVBC32-NEXT:    vand.vv v28, v13, v28
+; RV32ZVBC32-NEXT:    vmul.vv v28, v5, v28
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v28
+; RV32ZVBC32-NEXT:    vlse64.v v28, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 56
+; RV32ZVBC32-NEXT:    vand.vv v27, v13, v27
+; RV32ZVBC32-NEXT:    vmul.vv v27, v5, v27
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v27
+; RV32ZVBC32-NEXT:    vlse64.v v27, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 48
+; RV32ZVBC32-NEXT:    vand.vv v22, v13, v22
+; RV32ZVBC32-NEXT:    vmul.vv v22, v5, v22
+; RV32ZVBC32-NEXT:    vxor.vv v1, v1, v22
+; RV32ZVBC32-NEXT:    vlse64.v v22, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 40
+; RV32ZVBC32-NEXT:    vand.vv v19, v13, v19
+; RV32ZVBC32-NEXT:    vmul.vv v19, v5, v19
+; RV32ZVBC32-NEXT:    vxor.vv v19, v1, v19
+; RV32ZVBC32-NEXT:    vlse64.v v1, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 32
+; RV32ZVBC32-NEXT:    vand.vv v18, v13, v18
+; RV32ZVBC32-NEXT:    vand.vv v23, v13, v23
+; RV32ZVBC32-NEXT:    vmul.vv v18, v5, v18
+; RV32ZVBC32-NEXT:    vmul.vv v23, v5, v23
+; RV32ZVBC32-NEXT:    vxor.vv v18, v19, v18
+; RV32ZVBC32-NEXT:    vxor.vv v23, v18, v23
+; RV32ZVBC32-NEXT:    vlse64.v v18, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 24
+; RV32ZVBC32-NEXT:    vand.vv v16, v13, v16
+; RV32ZVBC32-NEXT:    vmul.vv v16, v5, v16
+; RV32ZVBC32-NEXT:    vxor.vv v16, v23, v16
+; RV32ZVBC32-NEXT:    vlse64.v v23, (a3), zero
+; RV32ZVBC32-NEXT:    addi a3, sp, 16
+; RV32ZVBC32-NEXT:    vand.vv v14, v13, v14
+; RV32ZVBC32-NEXT:    vmul.vv v14, v5, v14
+; RV32ZVBC32-NEXT:    vxor.vv v14, v16, v14
+; RV32ZVBC32-NEXT:    vlse64.v v16, (a3), zero
+; RV32ZVBC32-NEXT:    vand.vv v15, v13, v15
+; RV32ZVBC32-NEXT:    vmul.vv v15, v5, v15
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v15
+; RV32ZVBC32-NEXT:    vand.vv v15, v13, v17
+; RV32ZVBC32-NEXT:    vmul.vv v15, v5, v15
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v15
+; RV32ZVBC32-NEXT:    vand.vv v15, v13, v20
+; RV32ZVBC32-NEXT:    vand.vv v17, v13, v25
+; RV32ZVBC32-NEXT:    vmul.vv v15, v5, v15
+; RV32ZVBC32-NEXT:    vmul.vv v17, v5, v17
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v15
+; RV32ZVBC32-NEXT:    vand.vx v15, v19, a1
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v17
+; RV32ZVBC32-NEXT:    vsrl.vi v17, v14, 24
+; RV32ZVBC32-NEXT:    vand.vx v17, v17, a1
+; RV32ZVBC32-NEXT:    vand.vv v20, v13, v21
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vand.vv v20, v13, v24
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vand.vv v20, v13, v26
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vand.vv v20, v13, v2
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vand.vv v20, v13, v6
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vand.vv v20, v13, v7
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vand.vv v20, v13, v31
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vand.vv v20, v13, v30
+; RV32ZVBC32-NEXT:    vmul.vv v20, v5, v20
+; RV32ZVBC32-NEXT:    vand.vx v21, v3, a0
+; RV32ZVBC32-NEXT:    vsll.vx v21, v21, a2
+; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v20
+; RV32ZVBC32-NEXT:    vsrl.vx v20, v14, a2
+; RV32ZVBC32-NEXT:    vand.vx v20, v20, a0
+; RV32ZVBC32-NEXT:    vand.vv v24, v13, v29
+; RV32ZVBC32-NEXT:    vand.vv v25, v13, v28
+; RV32ZVBC32-NEXT:    vand.vv v26, v13, v27
+; RV32ZVBC32-NEXT:    vand.vv v22, v13, v22
+; RV32ZVBC32-NEXT:    vand.vv v27, v13, v1
+; RV32ZVBC32-NEXT:    vand.vv v18, v13, v18
+; RV32ZVBC32-NEXT:    vand.vv v23, v13, v23
+; RV32ZVBC32-NEXT:    vand.vv v13, v13, v16
+; RV32ZVBC32-NEXT:    vmul.vv v16, v5, v24
+; RV32ZVBC32-NEXT:    vmul.vv v24, v5, v25
+; RV32ZVBC32-NEXT:    vmul.vv v25, v5, v26
+; RV32ZVBC32-NEXT:    vmul.vv v22, v5, v22
+; RV32ZVBC32-NEXT:    vmul.vv v26, v5, v27
+; RV32ZVBC32-NEXT:    vmul.vv v18, v5, v18
+; RV32ZVBC32-NEXT:    vmul.vv v23, v5, v23
+; RV32ZVBC32-NEXT:    vmul.vv v13, v5, v13
+; RV32ZVBC32-NEXT:    vxor.vv v16, v14, v16
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v25
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v22
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v26
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v18
+; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v23
+; RV32ZVBC32-NEXT:    vxor.vv v13, v16, v13
+; RV32ZVBC32-NEXT:    vsll.vx v16, v4, ra
+; RV32ZVBC32-NEXT:    vsrl.vx v13, v13, ra
+; RV32ZVBC32-NEXT:    vor.vv v16, v16, v21
+; RV32ZVBC32-NEXT:    vsrl.vi v18, v19, 8
+; RV32ZVBC32-NEXT:    vsll.vi v15, v15, 24
+; RV32ZVBC32-NEXT:    vand.vv v18, v18, v12
+; RV32ZVBC32-NEXT:    vor.vv v17, v18, v17
+; RV32ZVBC32-NEXT:    vand.vv v12, v14, v12
+; RV32ZVBC32-NEXT:    vsll.vi v12, v12, 8
+; RV32ZVBC32-NEXT:    vor.vv v12, v15, v12
+; RV32ZVBC32-NEXT:    vor.vv v12, v16, v12
+; RV32ZVBC32-NEXT:    vor.vv v13, v20, v13
+; RV32ZVBC32-NEXT:    vor.vv v13, v17, v13
+; RV32ZVBC32-NEXT:    vor.vv v12, v12, v13
+; RV32ZVBC32-NEXT:    vsrl.vi v13, v12, 4
+; RV32ZVBC32-NEXT:    vand.vv v12, v12, v9
+; RV32ZVBC32-NEXT:    vand.vv v9, v13, v9
+; RV32ZVBC32-NEXT:    vsll.vi v12, v12, 4
+; RV32ZVBC32-NEXT:    vor.vv v9, v9, v12
+; RV32ZVBC32-NEXT:    vsrl.vi v12, v9, 2
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v10
+; RV32ZVBC32-NEXT:    vand.vv v10, v12, v10
+; RV32ZVBC32-NEXT:    vsll.vi v9, v9, 2
+; RV32ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vsrl.vi v10, v9, 1
+; RV32ZVBC32-NEXT:    vand.vv v9, v9, v11
+; RV32ZVBC32-NEXT:    vand.vv v10, v10, v11
+; RV32ZVBC32-NEXT:    vadd.vv v9, v9, v9
+; RV32ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV32ZVBC32-NEXT:    vsrl.vi v8, v9, 1, v0.t
+; RV32ZVBC32-NEXT:    lw ra, 332(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s0, 328(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s1, 324(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s2, 320(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s3, 316(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s4, 312(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s5, 308(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s6, 304(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s7, 300(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s8, 296(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s9, 292(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s10, 288(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    lw s11, 284(sp) # 4-byte Folded Reload
+; RV32ZVBC32-NEXT:    .cfi_restore ra
+; RV32ZVBC32-NEXT:    .cfi_restore s0
+; RV32ZVBC32-NEXT:    .cfi_restore s1
+; RV32ZVBC32-NEXT:    .cfi_restore s2
+; RV32ZVBC32-NEXT:    .cfi_restore s3
+; RV32ZVBC32-NEXT:    .cfi_restore s4
+; RV32ZVBC32-NEXT:    .cfi_restore s5
+; RV32ZVBC32-NEXT:    .cfi_restore s6
+; RV32ZVBC32-NEXT:    .cfi_restore s7
+; RV32ZVBC32-NEXT:    .cfi_restore s8
+; RV32ZVBC32-NEXT:    .cfi_restore s9
+; RV32ZVBC32-NEXT:    .cfi_restore s10
+; RV32ZVBC32-NEXT:    .cfi_restore s11
+; RV32ZVBC32-NEXT:    addi sp, sp, 336
+; RV32ZVBC32-NEXT:    .cfi_def_cfa_offset 0
+; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv1i64_vx_mask:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
-; RV64ZVBC-NEXT:    vclmulh.vx v8, v8, a0, v0.t
-; RV64ZVBC-NEXT:    ret
+; RV64ZVBC32-LABEL: clmulh_nxv1i64_vx_mask:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    li a1, 56
+; RV64ZVBC32-NEXT:    lui t2, 16
+; RV64ZVBC32-NEXT:    lui a2, 4080
+; RV64ZVBC32-NEXT:    li t0, 255
+; RV64ZVBC32-NEXT:    lui a3, 61681
+; RV64ZVBC32-NEXT:    lui a4, 209715
+; RV64ZVBC32-NEXT:    lui a5, 349525
+; RV64ZVBC32-NEXT:    srli a6, a0, 24
+; RV64ZVBC32-NEXT:    srli a7, a0, 8
+; RV64ZVBC32-NEXT:    srli t1, a0, 40
+; RV64ZVBC32-NEXT:    srli t3, a0, 56
+; RV64ZVBC32-NEXT:    addi a3, a3, -241
+; RV64ZVBC32-NEXT:    addi a4, a4, 819
+; RV64ZVBC32-NEXT:    addi t4, a5, 1365
+; RV64ZVBC32-NEXT:    slli a5, a3, 32
+; RV64ZVBC32-NEXT:    add a5, a3, a5
+; RV64ZVBC32-NEXT:    slli a3, a4, 32
+; RV64ZVBC32-NEXT:    add a4, a4, a3
+; RV64ZVBC32-NEXT:    slli a3, t4, 32
+; RV64ZVBC32-NEXT:    add a3, t4, a3
+; RV64ZVBC32-NEXT:    srliw t4, a0, 24
+; RV64ZVBC32-NEXT:    slli t0, t0, 24
+; RV64ZVBC32-NEXT:    and a6, a6, a2
+; RV64ZVBC32-NEXT:    and a7, a7, t0
+; RV64ZVBC32-NEXT:    or t5, a7, a6
+; RV64ZVBC32-NEXT:    addi a6, t2, -256
+; RV64ZVBC32-NEXT:    and a7, t1, a6
+; RV64ZVBC32-NEXT:    or t1, a7, t3
+; RV64ZVBC32-NEXT:    and a7, a0, a2
+; RV64ZVBC32-NEXT:    slli t4, t4, 32
+; RV64ZVBC32-NEXT:    slli a7, a7, 24
+; RV64ZVBC32-NEXT:    or t3, a7, t4
+; RV64ZVBC32-NEXT:    li a7, 40
+; RV64ZVBC32-NEXT:    vsetvli t4, zero, e64, m1, ta, mu
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v8, 24
+; RV64ZVBC32-NEXT:    vsrl.vi v9, v8, 8
+; RV64ZVBC32-NEXT:    or t1, t5, t1
+; RV64ZVBC32-NEXT:    slli t4, a0, 56
+; RV64ZVBC32-NEXT:    and a0, a0, a6
+; RV64ZVBC32-NEXT:    slli a0, a0, 40
+; RV64ZVBC32-NEXT:    or t4, t4, a0
+; RV64ZVBC32-NEXT:    li a0, 1
+; RV64ZVBC32-NEXT:    or t4, t4, t3
+; RV64ZVBC32-NEXT:    lui t3, 1
+; RV64ZVBC32-NEXT:    vsrl.vx v11, v8, a1
+; RV64ZVBC32-NEXT:    vsrl.vx v12, v8, a7
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a2
+; RV64ZVBC32-NEXT:    vand.vx v13, v8, a2
+; RV64ZVBC32-NEXT:    vsll.vx v14, v8, a1
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a6
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, t0
+; RV64ZVBC32-NEXT:    vsll.vi v13, v13, 24
+; RV64ZVBC32-NEXT:    vand.vx v15, v8, t0
+; RV64ZVBC32-NEXT:    vor.vv v11, v12, v11
+; RV64ZVBC32-NEXT:    vand.vx v12, v8, a6
+; RV64ZVBC32-NEXT:    vor.vv v9, v9, v10
+; RV64ZVBC32-NEXT:    vsll.vi v10, v15, 8
+; RV64ZVBC32-NEXT:    vsll.vx v12, v12, a7
+; RV64ZVBC32-NEXT:    vor.vv v9, v9, v11
+; RV64ZVBC32-NEXT:    vor.vv v10, v13, v10
+; RV64ZVBC32-NEXT:    vor.vv v11, v14, v12
+; RV64ZVBC32-NEXT:    vor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v9, 4
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a5
+; RV64ZVBC32-NEXT:    srli t4, t1, 4
+; RV64ZVBC32-NEXT:    and t1, t1, a5
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a5
+; RV64ZVBC32-NEXT:    vsll.vi v9, v9, 4
+; RV64ZVBC32-NEXT:    and t4, t4, a5
+; RV64ZVBC32-NEXT:    slli t1, t1, 4
+; RV64ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a4
+; RV64ZVBC32-NEXT:    srli t4, t1, 2
+; RV64ZVBC32-NEXT:    and t1, t1, a4
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a4
+; RV64ZVBC32-NEXT:    vsll.vi v9, v9, 2
+; RV64ZVBC32-NEXT:    and t4, t4, a4
+; RV64ZVBC32-NEXT:    slli t1, t1, 2
+; RV64ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v9, 1
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a3
+; RV64ZVBC32-NEXT:    srli t4, t1, 1
+; RV64ZVBC32-NEXT:    and t1, t1, a3
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a3
+; RV64ZVBC32-NEXT:    vadd.vv v9, v9, v9
+; RV64ZVBC32-NEXT:    and t4, t4, a3
+; RV64ZVBC32-NEXT:    slli t1, t1, 1
+; RV64ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    or t1, t4, t1
+; RV64ZVBC32-NEXT:    andi t4, t1, 2
+; RV64ZVBC32-NEXT:    vmul.vx v10, v9, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 1
+; RV64ZVBC32-NEXT:    vmul.vx v11, v9, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 4
+; RV64ZVBC32-NEXT:    vmul.vx v12, v9, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 8
+; RV64ZVBC32-NEXT:    vmul.vx v13, v9, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 16
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 32
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 64
+; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
+; RV64ZVBC32-NEXT:    vmul.vx v11, v9, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 128
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v9, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 256
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v9, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 512
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t4
+; RV64ZVBC32-NEXT:    andi t4, t1, 1024
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 11
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vx v11, v9, t4
+; RV64ZVBC32-NEXT:    lui t4, 2
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v9, t3
+; RV64ZVBC32-NEXT:    lui t3, 4
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v13, v10, v13
+; RV64ZVBC32-NEXT:    vxor.vv v13, v13, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t4
+; RV64ZVBC32-NEXT:    lui t4, 8
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v13, v13, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t3
+; RV64ZVBC32-NEXT:    lui t3, 32
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v11, v13, v11
+; RV64ZVBC32-NEXT:    vmul.vx v13, v9, t4
+; RV64ZVBC32-NEXT:    lui t4, 64
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v9, t2
+; RV64ZVBC32-NEXT:    lui t2, 128
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t3
+; RV64ZVBC32-NEXT:    lui t3, 256
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t4
+; RV64ZVBC32-NEXT:    lui t4, 512
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v9, t2
+; RV64ZVBC32-NEXT:    lui t2, 1024
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v9, t3
+; RV64ZVBC32-NEXT:    lui t3, 2048
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t4
+; RV64ZVBC32-NEXT:    lui t4, 4096
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t2
+; RV64ZVBC32-NEXT:    lui t2, 8192
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v9, t3
+; RV64ZVBC32-NEXT:    lui t3, 16384
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v9, t4
+; RV64ZVBC32-NEXT:    lui t4, 32768
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t2
+; RV64ZVBC32-NEXT:    lui t2, 65536
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t3
+; RV64ZVBC32-NEXT:    lui t3, 131072
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v9, t4
+; RV64ZVBC32-NEXT:    lui t4, 262144
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v11, v12
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 32
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vsll.vx v10, v10, a1
+; RV64ZVBC32-NEXT:    vand.vx v11, v11, a6
+; RV64ZVBC32-NEXT:    vsll.vx v11, v11, a7
+; RV64ZVBC32-NEXT:    vor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vx v11, v9, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 33
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 34
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v9, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 35
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 36
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v11, v12, v11
+; RV64ZVBC32-NEXT:    vmul.vx v12, v9, t4
+; RV64ZVBC32-NEXT:    srliw t4, t1, 31
+; RV64ZVBC32-NEXT:    slli t4, t4, 31
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 37
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 38
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v9, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 39
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 40
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v9, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 41
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 42
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v9, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 43
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 44
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v12
+; RV64ZVBC32-NEXT:    vmul.vx v12, v9, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 45
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 46
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v13, v11, v13
+; RV64ZVBC32-NEXT:    vxor.vv v13, v13, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 47
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v12, v13, v12
+; RV64ZVBC32-NEXT:    vmul.vx v13, v9, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 48
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 49
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 50
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v13
+; RV64ZVBC32-NEXT:    vmul.vx v13, v9, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 51
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 52
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 53
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v13, v12, v13
+; RV64ZVBC32-NEXT:    vxor.vv v13, v13, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 54
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v13, v13, v14
+; RV64ZVBC32-NEXT:    vsrl.vi v14, v11, 8
+; RV64ZVBC32-NEXT:    vand.vx v14, v14, t0
+; RV64ZVBC32-NEXT:    vsrl.vi v12, v12, 24
+; RV64ZVBC32-NEXT:    vand.vx v12, v12, a2
+; RV64ZVBC32-NEXT:    vor.vv v12, v14, v12
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 55
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v13, v13, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 56
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v13, v13, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 57
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    vxor.vv v13, v13, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t4
+; RV64ZVBC32-NEXT:    slli t4, a0, 58
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    vxor.vv v13, v13, v14
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, t2
+; RV64ZVBC32-NEXT:    slli t2, a0, 59
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    vxor.vv v13, v13, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t3
+; RV64ZVBC32-NEXT:    slli t3, a0, 60
+; RV64ZVBC32-NEXT:    vand.vx v11, v11, a2
+; RV64ZVBC32-NEXT:    slli a2, a0, 61
+; RV64ZVBC32-NEXT:    slli a0, a0, 62
+; RV64ZVBC32-NEXT:    and t4, t1, t4
+; RV64ZVBC32-NEXT:    and t2, t1, t2
+; RV64ZVBC32-NEXT:    and t3, t1, t3
+; RV64ZVBC32-NEXT:    and a2, t1, a2
+; RV64ZVBC32-NEXT:    and a0, t1, a0
+; RV64ZVBC32-NEXT:    srli t1, t1, 63
+; RV64ZVBC32-NEXT:    vsll.vi v11, v11, 24
+; RV64ZVBC32-NEXT:    vxor.vv v14, v13, v14
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v15
+; RV64ZVBC32-NEXT:    vand.vx v15, v13, t0
+; RV64ZVBC32-NEXT:    vsll.vi v15, v15, 8
+; RV64ZVBC32-NEXT:    vor.vv v11, v11, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t4
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, t2
+; RV64ZVBC32-NEXT:    vor.vv v10, v10, v11
+; RV64ZVBC32-NEXT:    vmul.vx v11, v9, t3
+; RV64ZVBC32-NEXT:    vxor.vv v14, v14, v15
+; RV64ZVBC32-NEXT:    vmul.vx v15, v9, a2
+; RV64ZVBC32-NEXT:    vxor.vv v11, v14, v11
+; RV64ZVBC32-NEXT:    vmul.vx v14, v9, a0
+; RV64ZVBC32-NEXT:    slli t1, t1, 63
+; RV64ZVBC32-NEXT:    vmul.vx v9, v9, t1
+; RV64ZVBC32-NEXT:    vsrl.vx v13, v13, a7
+; RV64ZVBC32-NEXT:    vand.vx v13, v13, a6
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v15
+; RV64ZVBC32-NEXT:    vxor.vv v11, v11, v14
+; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
+; RV64ZVBC32-NEXT:    vsrl.vx v9, v9, a1
+; RV64ZVBC32-NEXT:    vor.vv v9, v13, v9
+; RV64ZVBC32-NEXT:    vor.vv v9, v12, v9
+; RV64ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v9, 4
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a5
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a5
+; RV64ZVBC32-NEXT:    vsll.vi v9, v9, 4
+; RV64ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v9, 2
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a4
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a4
+; RV64ZVBC32-NEXT:    vsll.vi v9, v9, 2
+; RV64ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vsrl.vi v10, v9, 1
+; RV64ZVBC32-NEXT:    vand.vx v9, v9, a3
+; RV64ZVBC32-NEXT:    vand.vx v10, v10, a3
+; RV64ZVBC32-NEXT:    vadd.vv v9, v9, v9
+; RV64ZVBC32-NEXT:    vor.vv v9, v10, v9
+; RV64ZVBC32-NEXT:    vsrl.vi v8, v9, 1, v0.t
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i128 0
   %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
   %va.ext = zext <vscale x 1 x i64> %va to <vscale x 1 x i128>
@@ -46802,3 +86106,6 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vx_mask(<vscale x 1 x i64> %va, i64 %b
   %sel = select <vscale x 1 x i1> %mask, <vscale x 1 x i64> %res, <vscale x 1 x i64> %va
   ret <vscale x 1 x i64> %sel
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32ZVBC: {{.*}}
+; RV64ZVBC: {{.*}}

>From 6799fcb95bd9539da7309a513491d19bb564ec48 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Tue, 3 Mar 2026 15:44:51 -0800
Subject: [PATCH 2/3] [RISCV] Lower i8/i16/i32 scalable vector
 ISD::CLMUL/CLMULH with Zvbc32e.

---
 llvm/lib/Target/RISCV/RISCVFeatures.td       |     2 +
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp  |    11 +-
 llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td   |    14 +-
 llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll  | 13241 ++++---
 llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll | 31490 ++++++++++-------
 5 files changed, 25135 insertions(+), 19623 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index b79555a0a6f72..99706970539ec 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -776,6 +776,8 @@ def FeatureStdExtZvbc32e
                                  "Vector Carryless Multiplication with 32-bits elements",
                                  [FeatureStdExtZve32x]>;
 
+def HasStdExtZvbc32e : Predicate<"Subtarget->hasStdExtZvbc32e()">;
+
 def HasStdExtZvbcOrZvbc32e : Predicate<"Subtarget->hasStdExtZvbc() || Subtarget->hasStdExtZvbc32e()">,
                              AssemblerPredicate<(any_of FeatureStdExtZvbc, FeatureStdExtZvbc32e),
                              "'Zvbc' or 'Zvbc32e' (Vector Carryless Multiplication)">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7e62957529fea..768dac1eaa373 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1117,17 +1117,19 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         }
       }
 
-      if (Subtarget.hasStdExtZvbc() && Subtarget.hasVInstructionsI64()) {
-        // TODO: Support Zvbc32e.
-        if (VT.getVectorElementType() == MVT::i64)
+      if (VT.getVectorElementType() == MVT::i64) {
+        if (Subtarget.hasStdExtZvbc())
           setOperationAction({ISD::CLMUL, ISD::CLMULH}, VT, Legal);
-        else {
+      } else {
+        if (Subtarget.hasStdExtZvbc32e()) {
+          setOperationAction({ISD::CLMUL, ISD::CLMULH}, VT, Legal);
+        } else if (Subtarget.hasStdExtZvbc() && Subtarget.hasVInstructionsI64()) {
           // Promote to i64 if the lmul is small enough.
           // FIXME: Split if necessary to widen.
           // FIXME: Promote clmulh directly without legalizing to clmul first.
           MVT I64VecVT = MVT::getVectorVT(MVT::i64, VT.getVectorElementCount());
           if (isTypeLegal(I64VecVT))
             setOperationAction(ISD::CLMUL, VT, Custom);
         }
       }
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index 5392fc4a4aa64..58ce46d3eba4a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -527,7 +527,7 @@ let Predicates = [HasStdExtZvbb] in {
   defm PseudoVWSLL : VPseudoVWSLL;
 } // Predicates = [HasStdExtZvbb]
 
-let Predicates = [HasStdExtZvbc] in {
+let Predicates = [HasStdExtZvbcOrZvbc32e] in {
   defm PseudoVCLMUL  : VPseudoVCLMUL_VV_VX;
   defm PseudoVCLMULH : VPseudoVCLMUL_VV_VX;
-} // Predicates = [HasStdExtZvbc]
+} // Predicates = [HasStdExtZvbcOrZvbc32e]
@@ -716,8 +716,16 @@ foreach vtiToWti = AllWidenableIntVectors in {
   }
 }
 
-defm : VPatBinarySDNode_VV_VX<clmul, "PseudoVCLMUL", I64IntegerVectors, ExtraPreds=[HasStdExtZvbc]>;
-defm : VPatBinarySDNode_VV_VX<clmulh, "PseudoVCLMULH", I64IntegerVectors, ExtraPreds=[HasStdExtZvbc]>;
+defm : VPatBinarySDNode_VV_VX<clmul, "PseudoVCLMUL", I64IntegerVectors,
+                              ExtraPreds=[HasStdExtZvbc]>;
+defm : VPatBinarySDNode_VV_VX<clmulh, "PseudoVCLMULH", I64IntegerVectors,
+                              ExtraPreds=[HasStdExtZvbc]>;
+
+defvar NonI64IntegerVectors = !filter(vti, AllIntegerVectors, !le(vti.SEW, 32));
+defm : VPatBinarySDNode_VV_VX<clmul, "PseudoVCLMUL", NonI64IntegerVectors,
+                              ExtraPreds=[HasStdExtZvbc32e]>;
+defm : VPatBinarySDNode_VV_VX<clmulh, "PseudoVCLMULH", NonI64IntegerVectors,
+                              ExtraPreds=[HasStdExtZvbc32e]>;
 
 //===----------------------------------------------------------------------===//
 // VL patterns
diff --git a/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll
index 0ab98e756c6ed..4a7c373a0303d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll
@@ -102,65 +102,13 @@ define <vscale x 1 x i8> @clmul_nxv1i8_vv(<vscale x 1 x i8> %va, <vscale x 1 x i
 ; RV32ZVBC32-LABEL: clmul_nxv1i8_vv:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v9
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmul_nxv1i8_vv:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v9
 ; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 1 x i8> @llvm.clmul.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb)
   ret <vscale x 1 x i8> %v
@@ -268,67 +216,13 @@ define <vscale x 1 x i8> @clmul_nxv1i8_vx(<vscale x 1 x i8> %va, i8 %b) nounwind
 ; RV32ZVBC32-LABEL: clmul_nxv1i8_vx:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmul_nxv1i8_vx:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
@@ -432,65 +326,13 @@ define <vscale x 2 x i8> @clmul_nxv2i8_vv(<vscale x 2 x i8> %va, <vscale x 2 x i
 ; RV32ZVBC32-LABEL: clmul_nxv2i8_vv:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v9
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmul_nxv2i8_vv:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v9
 ; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 2 x i8> @llvm.clmul.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb)
   ret <vscale x 2 x i8> %v
@@ -598,67 +440,13 @@ define <vscale x 2 x i8> @clmul_nxv2i8_vx(<vscale x 2 x i8> %va, i8 %b) nounwind
 ; RV32ZVBC32-LABEL: clmul_nxv2i8_vx:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmul_nxv2i8_vx:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
@@ -762,65 +550,13 @@ define <vscale x 4 x i8> @clmul_nxv4i8_vv(<vscale x 4 x i8> %va, <vscale x 4 x i
 ; RV32ZVBC32-LABEL: clmul_nxv4i8_vv:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v9
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmul_nxv4i8_vv:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v9
 ; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.clmul.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb)
   ret <vscale x 4 x i8> %v
@@ -928,67 +664,13 @@ define <vscale x 4 x i8> @clmul_nxv4i8_vx(<vscale x 4 x i8> %va, i8 %b) nounwind
 ; RV32ZVBC32-LABEL: clmul_nxv4i8_vx:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmul_nxv4i8_vx:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
@@ -1092,65 +774,13 @@ define <vscale x 8 x i8> @clmul_nxv8i8_vv(<vscale x 8 x i8> %va, <vscale x 8 x i
 ; RV32ZVBC32-LABEL: clmul_nxv8i8_vv:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v9
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmul_nxv8i8_vv:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v9
 ; RV64ZVBC32-NEXT:    ret
   %v = call <vscale x 8 x i8> @llvm.clmul.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb)
   ret <vscale x 8 x i8> %v
@@ -1258,67 +888,13 @@ define <vscale x 8 x i8> @clmul_nxv8i8_vx(<vscale x 8 x i8> %va, i8 %b) nounwind
 ; RV32ZVBC32-LABEL: clmul_nxv8i8_vx:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmul_nxv8i8_vx:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -1327,1696 +903,891 @@ define <vscale x 8 x i8> @clmul_nxv8i8_vx(<vscale x 8 x i8> %va, i8 %b) nounwind
 }
 
 define <vscale x 16 x i8> @clmul_nxv16i8_vv(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv16i8_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vand.vi v12, v10, 2
-; CHECK-NEXT:    vand.vi v14, v10, 1
-; CHECK-NEXT:    vmul.vv v12, v8, v12
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v14, v12
-; CHECK-NEXT:    vand.vi v14, v10, 4
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vi v14, v10, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vmul.vv v8, v8, v10
-; CHECK-NEXT:    vxor.vv v8, v12, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 16 x i8> @llvm.clmul.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb)
-  ret <vscale x 16 x i8> %v
-}
-
-define <vscale x 16 x i8> @clmul_nxv16i8_vx(<vscale x 16 x i8> %va, i8 %b) nounwind {
-; CHECK-LABEL: clmul_nxv16i8_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v12, v10, 2
-; CHECK-NEXT:    vand.vi v14, v10, 1
-; CHECK-NEXT:    vmul.vv v12, v8, v12
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v14, v12
-; CHECK-NEXT:    vand.vi v14, v10, 4
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vi v14, v10, 8
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vmul.vv v8, v8, v10
-; CHECK-NEXT:    vxor.vv v8, v12, v8
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
-  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
-  %v = call <vscale x 16 x i8> @llvm.clmul.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb)
-  ret <vscale x 16 x i8> %v
-}
-
-define <vscale x 32 x i8> @clmul_nxv32i8_vv(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv32i8_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vand.vi v16, v12, 2
-; CHECK-NEXT:    vand.vi v20, v12, 1
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v20, v16
-; CHECK-NEXT:    vand.vi v20, v12, 4
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vi v20, v12, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vmul.vv v8, v8, v12
-; CHECK-NEXT:    vxor.vv v8, v16, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 32 x i8> @llvm.clmul.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb)
-  ret <vscale x 32 x i8> %v
-}
-
-define <vscale x 32 x i8> @clmul_nxv32i8_vx(<vscale x 32 x i8> %va, i8 %b) nounwind {
-; CHECK-LABEL: clmul_nxv32i8_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v16, v12, 2
-; CHECK-NEXT:    vand.vi v20, v12, 1
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v20, v16
-; CHECK-NEXT:    vand.vi v20, v12, 4
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vi v20, v12, 8
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vmul.vv v8, v8, v12
-; CHECK-NEXT:    vxor.vv v8, v16, v8
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
-  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
-  %v = call <vscale x 32 x i8> @llvm.clmul.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb)
-  ret <vscale x 32 x i8> %v
-}
-
-define <vscale x 64 x i8> @clmul_nxv64i8_vv(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv64i8_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vand.vi v24, v16, 2
-; CHECK-NEXT:    vand.vi v0, v16, 1
-; CHECK-NEXT:    vmul.vv v24, v8, v24
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v0, v24
-; CHECK-NEXT:    vand.vi v0, v16, 4
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vi v0, v16, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    vxor.vv v8, v24, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 64 x i8> @llvm.clmul.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb)
-  ret <vscale x 64 x i8> %v
-}
-
-define <vscale x 64 x i8> @clmul_nxv64i8_vx(<vscale x 64 x i8> %va, i8 %b) nounwind {
-; CHECK-LABEL: clmul_nxv64i8_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v24, v16, 2
-; CHECK-NEXT:    vand.vi v0, v16, 1
-; CHECK-NEXT:    vmul.vv v24, v8, v24
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v0, v24
-; CHECK-NEXT:    vand.vi v0, v16, 4
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vi v0, v16, 8
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    vxor.vv v8, v24, v8
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
-  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
-  %v = call <vscale x 64 x i8> @llvm.clmul.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb)
-  ret <vscale x 64 x i8> %v
-}
-
-define <vscale x 1 x i16> @clmul_nxv1i16_vv(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb) nounwind {
-; RV32V-LABEL: clmul_nxv1i16_vv:
+; RV32V-LABEL: clmul_nxv16i8_vv:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; RV32V-NEXT:    vand.vi v10, v9, 2
-; RV32V-NEXT:    vand.vi v11, v9, 1
-; RV32V-NEXT:    vmul.vv v10, v8, v10
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v11, v10
-; RV32V-NEXT:    vand.vi v11, v9, 4
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; RV32V-NEXT:    vand.vi v12, v10, 2
+; RV32V-NEXT:    vand.vi v14, v10, 1
+; RV32V-NEXT:    vmul.vv v12, v8, v12
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v14, v12
+; RV32V-NEXT:    vand.vi v14, v10, 4
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vi v14, v10, 8
 ; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 1
-; RV32V-NEXT:    slli a0, a0, 11
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 4
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 8
-; RV32V-NEXT:    vand.vx v9, v9, a0
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vmul.vv v8, v8, v9
-; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    vand.vx v10, v10, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vmul.vv v8, v8, v10
+; RV32V-NEXT:    vxor.vv v8, v12, v8
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmul_nxv1i16_vv:
+; RV64V-LABEL: clmul_nxv16i8_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; RV64V-NEXT:    vand.vi v10, v9, 2
-; RV64V-NEXT:    vand.vi v11, v9, 1
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v11, v10
-; RV64V-NEXT:    vand.vi v11, v9, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; RV64V-NEXT:    vand.vi v12, v10, 2
+; RV64V-NEXT:    vand.vi v14, v10, 1
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v14, v12
+; RV64V-NEXT:    vand.vi v14, v10, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vi v14, v10, 8
 ; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 1
-; RV64V-NEXT:    slli a0, a0, 11
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 1
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 2
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 8
-; RV64V-NEXT:    vand.vx v9, v9, a0
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vmul.vv v8, v8, v9
-; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    vand.vx v10, v10, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vv v8, v8, v10
+; RV64V-NEXT:    vxor.vv v8, v12, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv1i16_vv:
+; RV32ZVBC64-LABEL: clmul_nxv16i8_vv:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf4 v10, v9
-; RV32ZVBC64-NEXT:    vzext.vf4 v9, v8
-; RV32ZVBC64-NEXT:    vclmul.vv v8, v9, v10
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; RV32ZVBC64-NEXT:    vand.vi v12, v10, 2
+; RV32ZVBC64-NEXT:    vand.vi v14, v10, 1
+; RV32ZVBC64-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC64-NEXT:    vxor.vv v12, v14, v12
+; RV32ZVBC64-NEXT:    vand.vi v14, v10, 4
+; RV32ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC64-NEXT:    vand.vi v14, v10, 8
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC64-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC64-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC64-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vand.vx v10, v10, a0
+; RV32ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v10
+; RV32ZVBC64-NEXT:    vxor.vv v8, v12, v8
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmul_nxv1i16_vv:
+; RV64ZVBC64-LABEL: clmul_nxv16i8_vv:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf4 v10, v9
-; RV64ZVBC64-NEXT:    vzext.vf4 v9, v8
-; RV64ZVBC64-NEXT:    vclmul.vv v8, v9, v10
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; RV64ZVBC64-NEXT:    vand.vi v12, v10, 2
+; RV64ZVBC64-NEXT:    vand.vi v14, v10, 1
+; RV64ZVBC64-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC64-NEXT:    vxor.vv v12, v14, v12
+; RV64ZVBC64-NEXT:    vand.vi v14, v10, 4
+; RV64ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC64-NEXT:    vand.vi v14, v10, 8
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC64-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC64-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC64-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vand.vx v10, v10, a0
+; RV64ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v10
+; RV64ZVBC64-NEXT:    vxor.vv v8, v12, v8
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv1i16_vv:
+; RV32ZVBC32-LABEL: clmul_nxv16i8_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v10
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv1i16_vv:
+; RV64ZVBC32-LABEL: clmul_nxv16i8_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v10
 ; RV64ZVBC32-NEXT:    ret
-  %v = call <vscale x 1 x i16> @llvm.clmul.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb)
-  ret <vscale x 1 x i16> %v
+  %v = call <vscale x 16 x i8> @llvm.clmul.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb)
+  ret <vscale x 16 x i8> %v
 }
 
-define <vscale x 1 x i16> @clmul_nxv1i16_vx(<vscale x 1 x i16> %va, i16 %b) nounwind {
-; RV32V-LABEL: clmul_nxv1i16_vx:
+define <vscale x 16 x i8> @clmul_nxv16i8_vx(<vscale x 16 x i8> %va, i8 %b) nounwind {
+; RV32V-LABEL: clmul_nxv16i8_vx:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV32V-NEXT:    vmv.v.x v10, a0
 ; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vand.vi v10, v9, 2
-; RV32V-NEXT:    vand.vi v11, v9, 1
-; RV32V-NEXT:    vmul.vv v10, v8, v10
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v11, v10
-; RV32V-NEXT:    vand.vi v11, v9, 4
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vi v11, v9, 8
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vand.vi v12, v10, 2
+; RV32V-NEXT:    vand.vi v14, v10, 1
+; RV32V-NEXT:    vmul.vv v12, v8, v12
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v14, v12
+; RV32V-NEXT:    vand.vi v14, v10, 4
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vi v14, v10, 8
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 1
-; RV32V-NEXT:    slli a0, a0, 11
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 4
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 8
-; RV32V-NEXT:    vand.vx v9, v9, a0
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vmul.vv v8, v8, v9
-; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    vand.vx v10, v10, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vmul.vv v8, v8, v10
+; RV32V-NEXT:    vxor.vv v8, v12, v8
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmul_nxv1i16_vx:
+; RV64V-LABEL: clmul_nxv16i8_vx:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV64V-NEXT:    vmv.v.x v10, a0
 ; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vand.vi v10, v9, 2
-; RV64V-NEXT:    vand.vi v11, v9, 1
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v11, v10
-; RV64V-NEXT:    vand.vi v11, v9, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vi v11, v9, 8
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vand.vi v12, v10, 2
+; RV64V-NEXT:    vand.vi v14, v10, 1
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v14, v12
+; RV64V-NEXT:    vand.vi v14, v10, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vi v14, v10, 8
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 1
-; RV64V-NEXT:    slli a0, a0, 11
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 1
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 2
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 8
-; RV64V-NEXT:    vand.vx v9, v9, a0
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vmul.vv v8, v8, v9
-; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    vand.vx v10, v10, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vv v8, v8, v10
+; RV64V-NEXT:    vxor.vv v8, v12, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv1i16_vx:
+; RV32ZVBC64-LABEL: clmul_nxv16i8_vx:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; RV32ZVBC64-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf4 v10, v8
-; RV32ZVBC64-NEXT:    vzext.vf4 v8, v9
-; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v10, a0
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vand.vi v12, v10, 2
+; RV32ZVBC64-NEXT:    vand.vi v14, v10, 1
+; RV32ZVBC64-NEXT:    vmul.vv v12, v8, v12
+; RV32ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC64-NEXT:    vxor.vv v12, v14, v12
+; RV32ZVBC64-NEXT:    vand.vi v14, v10, 4
+; RV32ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC64-NEXT:    vand.vi v14, v10, 8
+; RV32ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC64-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC64-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC64-NEXT:    vand.vx v14, v10, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vand.vx v10, v10, a0
+; RV32ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV32ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v10
+; RV32ZVBC64-NEXT:    vxor.vv v8, v12, v8
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmul_nxv1i16_vx:
+; RV64ZVBC64-LABEL: clmul_nxv16i8_vx:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; RV64ZVBC64-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf4 v10, v8
-; RV64ZVBC64-NEXT:    vzext.vf4 v8, v9
-; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v10, a0
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vand.vi v12, v10, 2
+; RV64ZVBC64-NEXT:    vand.vi v14, v10, 1
+; RV64ZVBC64-NEXT:    vmul.vv v12, v8, v12
+; RV64ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC64-NEXT:    vxor.vv v12, v14, v12
+; RV64ZVBC64-NEXT:    vand.vi v14, v10, 4
+; RV64ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC64-NEXT:    vand.vi v14, v10, 8
+; RV64ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC64-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC64-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC64-NEXT:    vand.vx v14, v10, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vand.vx v10, v10, a0
+; RV64ZVBC64-NEXT:    vmul.vv v14, v8, v14
+; RV64ZVBC64-NEXT:    vxor.vv v12, v12, v14
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v10
+; RV64ZVBC64-NEXT:    vxor.vv v8, v12, v8
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv1i16_vx:
+; RV32ZVBC32-LABEL: clmul_nxv16i8_vx:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv1i16_vx:
+; RV64ZVBC32-LABEL: clmul_nxv16i8_vx:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
-  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
-  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
-  %v = call <vscale x 1 x i16> @llvm.clmul.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb)
-  ret <vscale x 1 x i16> %v
+  %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i8> @llvm.clmul.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb)
+  ret <vscale x 16 x i8> %v
 }
 
-define <vscale x 2 x i16> @clmul_nxv2i16_vv(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb) nounwind {
-; RV32V-LABEL: clmul_nxv2i16_vv:
+define <vscale x 32 x i8> @clmul_nxv32i8_vv(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv32i8_vv:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; RV32V-NEXT:    vand.vi v10, v9, 2
-; RV32V-NEXT:    vand.vi v11, v9, 1
-; RV32V-NEXT:    vmul.vv v10, v8, v10
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v11, v10
-; RV32V-NEXT:    vand.vi v11, v9, 4
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; RV32V-NEXT:    vand.vi v16, v12, 2
+; RV32V-NEXT:    vand.vi v20, v12, 1
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v20, v16
+; RV32V-NEXT:    vand.vi v20, v12, 4
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vi v20, v12, 8
 ; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 1
-; RV32V-NEXT:    slli a0, a0, 11
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 4
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 8
-; RV32V-NEXT:    vand.vx v9, v9, a0
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vmul.vv v8, v8, v9
-; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    vand.vx v12, v12, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vmul.vv v8, v8, v12
+; RV32V-NEXT:    vxor.vv v8, v16, v8
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmul_nxv2i16_vv:
+; RV64V-LABEL: clmul_nxv32i8_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; RV64V-NEXT:    vand.vi v10, v9, 2
-; RV64V-NEXT:    vand.vi v11, v9, 1
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v11, v10
-; RV64V-NEXT:    vand.vi v11, v9, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; RV64V-NEXT:    vand.vi v16, v12, 2
+; RV64V-NEXT:    vand.vi v20, v12, 1
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v20, v16
+; RV64V-NEXT:    vand.vi v20, v12, 4
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vi v20, v12, 8
 ; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 1
-; RV64V-NEXT:    slli a0, a0, 11
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 1
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 2
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 8
-; RV64V-NEXT:    vand.vx v9, v9, a0
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vmul.vv v8, v8, v9
-; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    vand.vx v12, v12, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vv v8, v8, v12
+; RV64V-NEXT:    vxor.vv v8, v16, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv2i16_vv:
+; RV32ZVBC64-LABEL: clmul_nxv32i8_vv:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf4 v10, v9
-; RV32ZVBC64-NEXT:    vzext.vf4 v12, v8
-; RV32ZVBC64-NEXT:    vclmul.vv v8, v12, v10
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; RV32ZVBC64-NEXT:    vand.vi v16, v12, 2
+; RV32ZVBC64-NEXT:    vand.vi v20, v12, 1
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v20, v16
+; RV32ZVBC64-NEXT:    vand.vi v20, v12, 4
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vi v20, v12, 8
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vand.vx v12, v12, a0
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v12
+; RV32ZVBC64-NEXT:    vxor.vv v8, v16, v8
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmul_nxv2i16_vv:
+; RV64ZVBC64-LABEL: clmul_nxv32i8_vv:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf4 v10, v9
-; RV64ZVBC64-NEXT:    vzext.vf4 v12, v8
-; RV64ZVBC64-NEXT:    vclmul.vv v8, v12, v10
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; RV64ZVBC64-NEXT:    vand.vi v16, v12, 2
+; RV64ZVBC64-NEXT:    vand.vi v20, v12, 1
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v20, v16
+; RV64ZVBC64-NEXT:    vand.vi v20, v12, 4
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vi v20, v12, 8
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vand.vx v12, v12, a0
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v12
+; RV64ZVBC64-NEXT:    vxor.vv v8, v16, v8
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv2i16_vv:
+; RV32ZVBC32-LABEL: clmul_nxv32i8_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v12
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv2i16_vv:
+; RV64ZVBC32-LABEL: clmul_nxv32i8_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v12
 ; RV64ZVBC32-NEXT:    ret
-  %v = call <vscale x 2 x i16> @llvm.clmul.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb)
-  ret <vscale x 2 x i16> %v
+  %v = call <vscale x 32 x i8> @llvm.clmul.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb)
+  ret <vscale x 32 x i8> %v
 }
 
-define <vscale x 2 x i16> @clmul_nxv2i16_vx(<vscale x 2 x i16> %va, i16 %b) nounwind {
-; RV32V-LABEL: clmul_nxv2i16_vx:
+define <vscale x 32 x i8> @clmul_nxv32i8_vx(<vscale x 32 x i8> %va, i8 %b) nounwind {
+; RV32V-LABEL: clmul_nxv32i8_vx:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV32V-NEXT:    vmv.v.x v12, a0
 ; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vand.vi v10, v9, 2
-; RV32V-NEXT:    vand.vi v11, v9, 1
-; RV32V-NEXT:    vmul.vv v10, v8, v10
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v11, v10
-; RV32V-NEXT:    vand.vi v11, v9, 4
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vi v11, v9, 8
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vand.vi v16, v12, 2
+; RV32V-NEXT:    vand.vi v20, v12, 1
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v20, v16
+; RV32V-NEXT:    vand.vi v20, v12, 4
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vi v20, v12, 8
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 1
-; RV32V-NEXT:    slli a0, a0, 11
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 4
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 8
-; RV32V-NEXT:    vand.vx v9, v9, a0
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vmul.vv v8, v8, v9
-; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    vand.vx v12, v12, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vmul.vv v8, v8, v12
+; RV32V-NEXT:    vxor.vv v8, v16, v8
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmul_nxv2i16_vx:
+; RV64V-LABEL: clmul_nxv32i8_vx:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV64V-NEXT:    vmv.v.x v12, a0
 ; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vand.vi v10, v9, 2
-; RV64V-NEXT:    vand.vi v11, v9, 1
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v11, v10
-; RV64V-NEXT:    vand.vi v11, v9, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vi v11, v9, 8
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vand.vi v16, v12, 2
+; RV64V-NEXT:    vand.vi v20, v12, 1
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v20, v16
+; RV64V-NEXT:    vand.vi v20, v12, 4
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vi v20, v12, 8
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 1
-; RV64V-NEXT:    slli a0, a0, 11
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 1
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 2
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 8
-; RV64V-NEXT:    vand.vx v9, v9, a0
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vmul.vv v8, v8, v9
-; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    vand.vx v12, v12, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vv v8, v8, v12
+; RV64V-NEXT:    vxor.vv v8, v16, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv2i16_vx:
+; RV32ZVBC64-LABEL: clmul_nxv32i8_vx:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; RV32ZVBC64-NEXT:    vmv.v.x v12, a0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf4 v10, v8
-; RV32ZVBC64-NEXT:    vzext.vf4 v8, v12
-; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vand.vi v16, v12, 2
+; RV32ZVBC64-NEXT:    vand.vi v20, v12, 1
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v20, v16
+; RV32ZVBC64-NEXT:    vand.vi v20, v12, 4
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vi v20, v12, 8
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vand.vx v12, v12, a0
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v12
+; RV32ZVBC64-NEXT:    vxor.vv v8, v16, v8
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmul_nxv2i16_vx:
+; RV64ZVBC64-LABEL: clmul_nxv32i8_vx:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; RV64ZVBC64-NEXT:    vmv.v.x v12, a0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf4 v10, v8
-; RV64ZVBC64-NEXT:    vzext.vf4 v8, v12
-; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vand.vi v16, v12, 2
+; RV64ZVBC64-NEXT:    vand.vi v20, v12, 1
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v20, v16
+; RV64ZVBC64-NEXT:    vand.vi v20, v12, 4
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vi v20, v12, 8
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vand.vx v12, v12, a0
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v12
+; RV64ZVBC64-NEXT:    vxor.vv v8, v16, v8
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv2i16_vx:
+; RV32ZVBC32-LABEL: clmul_nxv32i8_vx:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv2i16_vx:
+; RV64ZVBC32-LABEL: clmul_nxv32i8_vx:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
-  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
-  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
-  %v = call <vscale x 2 x i16> @llvm.clmul.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb)
-  ret <vscale x 2 x i16> %v
+  %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i8> @llvm.clmul.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb)
+  ret <vscale x 32 x i8> %v
 }
 
-define <vscale x 4 x i16> @clmul_nxv4i16_vv(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb) nounwind {
-; RV32V-LABEL: clmul_nxv4i16_vv:
+define <vscale x 64 x i8> @clmul_nxv64i8_vv(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv64i8_vv:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV32V-NEXT:    vand.vi v10, v9, 2
-; RV32V-NEXT:    vand.vi v11, v9, 1
-; RV32V-NEXT:    vmul.vv v10, v8, v10
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v11, v10
-; RV32V-NEXT:    vand.vi v11, v9, 4
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV32V-NEXT:    vand.vi v24, v16, 2
+; RV32V-NEXT:    vand.vi v0, v16, 1
+; RV32V-NEXT:    vmul.vv v24, v8, v24
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v0, v24
+; RV32V-NEXT:    vand.vi v0, v16, 4
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vi v0, v16, 8
 ; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
 ; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
 ; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
 ; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    li a0, 1
-; RV32V-NEXT:    slli a0, a0, 11
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 4
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 8
-; RV32V-NEXT:    vand.vx v9, v9, a0
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vmul.vv v8, v8, v9
-; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vv v8, v8, v16
+; RV32V-NEXT:    vxor.vv v8, v24, v8
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmul_nxv4i16_vv:
+; RV64V-LABEL: clmul_nxv64i8_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV64V-NEXT:    vand.vi v10, v9, 2
-; RV64V-NEXT:    vand.vi v11, v9, 1
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v11, v10
-; RV64V-NEXT:    vand.vi v11, v9, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV64V-NEXT:    vand.vi v24, v16, 2
+; RV64V-NEXT:    vand.vi v0, v16, 1
+; RV64V-NEXT:    vmul.vv v24, v8, v24
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v0, v24
+; RV64V-NEXT:    vand.vi v0, v16, 4
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vi v0, v16, 8
 ; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    li a0, 1
-; RV64V-NEXT:    slli a0, a0, 11
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 1
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 2
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 8
-; RV64V-NEXT:    vand.vx v9, v9, a0
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vmul.vv v8, v8, v9
-; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vv v8, v8, v16
+; RV64V-NEXT:    vxor.vv v8, v24, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv4i16_vv:
+; RV32ZVBC64-LABEL: clmul_nxv64i8_vv:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf4 v12, v9
-; RV32ZVBC64-NEXT:    vzext.vf4 v16, v8
-; RV32ZVBC64-NEXT:    vclmul.vv v8, v16, v12
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV32ZVBC64-NEXT:    vand.vi v24, v16, 2
+; RV32ZVBC64-NEXT:    vand.vi v0, v16, 1
+; RV32ZVBC64-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV32ZVBC64-NEXT:    vand.vi v0, v16, 4
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vi v0, v16, 8
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    vxor.vv v8, v24, v8
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmul_nxv4i16_vv:
+; RV64ZVBC64-LABEL: clmul_nxv64i8_vv:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf4 v12, v9
-; RV64ZVBC64-NEXT:    vzext.vf4 v16, v8
-; RV64ZVBC64-NEXT:    vclmul.vv v8, v16, v12
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV64ZVBC64-NEXT:    vand.vi v24, v16, 2
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 1
+; RV64ZVBC64-NEXT:    vmul.vv v24, v8, v24
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 8
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    vxor.vv v8, v24, v8
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv4i16_vv:
+; RV32ZVBC32-LABEL: clmul_nxv64i8_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v16
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv4i16_vv:
+; RV64ZVBC32-LABEL: clmul_nxv64i8_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v16
 ; RV64ZVBC32-NEXT:    ret
-  %v = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb)
-  ret <vscale x 4 x i16> %v
+  %v = call <vscale x 64 x i8> @llvm.clmul.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb)
+  ret <vscale x 64 x i8> %v
 }
 
-define <vscale x 4 x i16> @clmul_nxv4i16_vx(<vscale x 4 x i16> %va, i16 %b) nounwind {
-; RV32V-LABEL: clmul_nxv4i16_vx:
+define <vscale x 64 x i8> @clmul_nxv64i8_vx(<vscale x 64 x i8> %va, i8 %b) nounwind {
+; RV32V-LABEL: clmul_nxv64i8_vx:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV32V-NEXT:    vmv.v.x v16, a0
 ; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v24, v16, 2
+; RV32V-NEXT:    vand.vi v0, v16, 1
+; RV32V-NEXT:    vmul.vv v24, v8, v24
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v0, v24
+; RV32V-NEXT:    vand.vi v0, v16, 4
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vi v0, v16, 8
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vv v8, v8, v16
+; RV32V-NEXT:    vxor.vv v8, v24, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv64i8_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV64V-NEXT:    vmv.v.x v16, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v24, v16, 2
+; RV64V-NEXT:    vand.vi v0, v16, 1
+; RV64V-NEXT:    vmul.vv v24, v8, v24
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v0, v24
+; RV64V-NEXT:    vand.vi v0, v16, 4
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vi v0, v16, 8
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vv v8, v8, v16
+; RV64V-NEXT:    vxor.vv v8, v24, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmul_nxv64i8_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vand.vi v24, v16, 2
+; RV32ZVBC64-NEXT:    vand.vi v0, v16, 1
+; RV32ZVBC64-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV32ZVBC64-NEXT:    vand.vi v0, v16, 4
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vi v0, v16, 8
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    vxor.vv v8, v24, v8
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv64i8_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vand.vi v24, v16, 2
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 1
+; RV64ZVBC64-NEXT:    vmul.vv v24, v8, v24
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 8
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    vxor.vv v8, v24, v8
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv64i8_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv64i8_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %v = call <vscale x 64 x i8> @llvm.clmul.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb)
+  ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 1 x i16> @clmul_nxv1i16_vv(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv1i16_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; RV32V-NEXT:    vand.vi v10, v9, 2
 ; RV32V-NEXT:    vand.vi v11, v9, 1
 ; RV32V-NEXT:    vmul.vv v10, v8, v10
@@ -3026,6 +1797,7 @@ define <vscale x 4 x i16> @clmul_nxv4i16_vx(<vscale x 4 x i16> %va, i16 %b) noun
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    li a0, 16
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vx v11, v9, a0
@@ -3078,11 +1850,9 @@ define <vscale x 4 x i16> @clmul_nxv4i16_vx(<vscale x 4 x i16> %va, i16 %b) noun
 ; RV32V-NEXT:    vxor.vv v8, v10, v8
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmul_nxv4i16_vx:
+; RV64V-LABEL: clmul_nxv1i16_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV64V-NEXT:    vmv.v.x v9, a0
-; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; RV64V-NEXT:    vand.vi v10, v9, 2
 ; RV64V-NEXT:    vand.vi v11, v9, 1
 ; RV64V-NEXT:    vmul.vv v10, v8, v10
@@ -3092,6 +1862,7 @@ define <vscale x 4 x i16> @clmul_nxv4i16_vx(<vscale x 4 x i16> %va, i16 %b) noun
 ; RV64V-NEXT:    vmul.vv v11, v8, v11
 ; RV64V-NEXT:    vxor.vv v10, v10, v11
 ; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    li a0, 16
 ; RV64V-NEXT:    vmul.vv v11, v8, v11
 ; RV64V-NEXT:    vxor.vv v10, v10, v11
 ; RV64V-NEXT:    vand.vx v11, v9, a0
@@ -3144,1043 +1915,399 @@ define <vscale x 4 x i16> @clmul_nxv4i16_vx(<vscale x 4 x i16> %va, i16 %b) noun
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv4i16_vx:
+; RV32ZVBC64-LABEL: clmul_nxv1i16_vv:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV32ZVBC64-NEXT:    vmv.v.x v16, a0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf4 v12, v8
-; RV32ZVBC64-NEXT:    vzext.vf4 v8, v16
-; RV32ZVBC64-NEXT:    vclmul.vv v8, v12, v8
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
-; RV32ZVBC64-NEXT:    ret
-;
-; RV64ZVBC64-LABEL: clmul_nxv4i16_vx:
-; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf4 v12, v8
-; RV64ZVBC64-NEXT:    vzext.vf4 v8, v16
-; RV64ZVBC64-NEXT:    vclmul.vv v8, v12, v8
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf4 v9, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv1i16_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf4 v9, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv4i16_vx:
+; RV32ZVBC32-LABEL: clmul_nxv1i16_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v9
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv4i16_vx:
+; RV64ZVBC32-LABEL: clmul_nxv1i16_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v9
 ; RV64ZVBC32-NEXT:    ret
-  %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
-  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
-  %v = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb)
-  ret <vscale x 4 x i16> %v
+  %v = call <vscale x 1 x i16> @llvm.clmul.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb)
+  ret <vscale x 1 x i16> %v
 }
 
-define <vscale x 8 x i16> @clmul_nxv8i16_vv(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb) nounwind {
-; RV32V-LABEL: clmul_nxv8i16_vv:
+define <vscale x 1 x i16> @clmul_nxv1i16_vx(<vscale x 1 x i16> %va, i16 %b) nounwind {
+; RV32V-LABEL: clmul_nxv1i16_vx:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; RV32V-NEXT:    vand.vi v12, v10, 2
-; RV32V-NEXT:    vand.vi v14, v10, 1
-; RV32V-NEXT:    vmul.vv v12, v8, v12
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v14, v12
-; RV32V-NEXT:    vand.vi v14, v10, 4
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vi v14, v10, 8
+; RV32V-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV32V-NEXT:    vmv.v.x v9, a0
 ; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    li a0, 1
 ; RV32V-NEXT:    slli a0, a0, 11
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    lui a0, 4
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    lui a0, 8
-; RV32V-NEXT:    vand.vx v10, v10, a0
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vmul.vv v8, v8, v10
-; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmul_nxv8i16_vv:
+; RV64V-LABEL: clmul_nxv1i16_vx:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; RV64V-NEXT:    vand.vi v12, v10, 2
-; RV64V-NEXT:    vand.vi v14, v10, 1
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v14, v12
-; RV64V-NEXT:    vand.vi v14, v10, 4
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vi v14, v10, 8
+; RV64V-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
 ; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    li a0, 1
 ; RV64V-NEXT:    slli a0, a0, 11
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    lui a0, 1
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    lui a0, 2
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    lui a0, 4
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    lui a0, 8
-; RV64V-NEXT:    vand.vx v10, v10, a0
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vmul.vv v8, v8, v10
-; RV64V-NEXT:    vxor.vv v8, v12, v8
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv8i16_vv:
+; RV32ZVBC64-LABEL: clmul_nxv1i16_vx:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf4 v16, v10
-; RV32ZVBC64-NEXT:    vzext.vf4 v24, v8
-; RV32ZVBC64-NEXT:    vclmul.vv v8, v24, v16
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf4 v8, v9
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmul_nxv8i16_vv:
+; RV64ZVBC64-LABEL: clmul_nxv1i16_vx:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf4 v16, v10
-; RV64ZVBC64-NEXT:    vzext.vf4 v24, v8
-; RV64ZVBC64-NEXT:    vclmul.vv v8, v24, v16
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf4 v8, v9
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv8i16_vv:
+; RV32ZVBC32-LABEL: clmul_nxv1i16_vx:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; RV32ZVBC32-NEXT:    vand.vi v12, v10, 2
-; RV32ZVBC32-NEXT:    vand.vi v14, v10, 1
-; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v14, v12
-; RV32ZVBC32-NEXT:    vand.vi v14, v10, 4
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vi v14, v10, 8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v10, v10, a0
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v10
-; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv8i16_vv:
+; RV64ZVBC32-LABEL: clmul_nxv1i16_vx:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; RV64ZVBC32-NEXT:    vand.vi v12, v10, 2
-; RV64ZVBC32-NEXT:    vand.vi v14, v10, 1
-; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
-; RV64ZVBC32-NEXT:    vand.vi v14, v10, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vi v14, v10, 8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v10, v10, a0
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v10
-; RV64ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
-  %v = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb)
-  ret <vscale x 8 x i16> %v
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.clmul.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb)
+  ret <vscale x 1 x i16> %v
 }
 
-define <vscale x 8 x i16> @clmul_nxv8i16_vx(<vscale x 8 x i16> %va, i16 %b) nounwind {
-; RV32V-LABEL: clmul_nxv8i16_vx:
+define <vscale x 2 x i16> @clmul_nxv2i16_vv(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv2i16_vv:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV32V-NEXT:    vmv.v.x v10, a0
+; RV32V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
 ; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vand.vi v12, v10, 2
-; RV32V-NEXT:    vand.vi v14, v10, 1
-; RV32V-NEXT:    vmul.vv v12, v8, v12
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v14, v12
-; RV32V-NEXT:    vand.vi v14, v10, 4
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vi v14, v10, 8
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    li a0, 1
 ; RV32V-NEXT:    slli a0, a0, 11
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    lui a0, 4
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    lui a0, 8
-; RV32V-NEXT:    vand.vx v10, v10, a0
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vmul.vv v8, v8, v10
-; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmul_nxv8i16_vx:
+; RV64V-LABEL: clmul_nxv2i16_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV64V-NEXT:    vmv.v.x v10, a0
+; RV64V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
 ; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vand.vi v12, v10, 2
-; RV64V-NEXT:    vand.vi v14, v10, 1
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v14, v12
-; RV64V-NEXT:    vand.vi v14, v10, 4
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vi v14, v10, 8
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    li a0, 1
 ; RV64V-NEXT:    slli a0, a0, 11
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    lui a0, 1
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    lui a0, 2
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    lui a0, 4
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    lui a0, 8
-; RV64V-NEXT:    vand.vx v10, v10, a0
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vmul.vv v8, v8, v10
-; RV64V-NEXT:    vxor.vv v8, v12, v8
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv8i16_vx:
+; RV32ZVBC64-LABEL: clmul_nxv2i16_vv:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf4 v16, v8
-; RV32ZVBC64-NEXT:    vzext.vf4 v8, v24
-; RV32ZVBC64-NEXT:    vclmul.vv v8, v16, v8
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf4 v12, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v12, v10
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmul_nxv8i16_vx:
+; RV64ZVBC64-LABEL: clmul_nxv2i16_vv:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf4 v16, v8
-; RV64ZVBC64-NEXT:    vzext.vf4 v8, v24
-; RV64ZVBC64-NEXT:    vclmul.vv v8, v16, v8
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf4 v12, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v12, v10
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv8i16_vx:
+; RV32ZVBC32-LABEL: clmul_nxv2i16_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v10, a0
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v12, v10, 2
-; RV32ZVBC32-NEXT:    vand.vi v14, v10, 1
-; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v14, v12
-; RV32ZVBC32-NEXT:    vand.vi v14, v10, 4
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vi v14, v10, 8
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v10, v10, a0
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v10
-; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v9
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv8i16_vx:
+; RV64ZVBC32-LABEL: clmul_nxv2i16_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v10, a0
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v12, v10, 2
-; RV64ZVBC32-NEXT:    vand.vi v14, v10, 1
-; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
-; RV64ZVBC32-NEXT:    vand.vi v14, v10, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vi v14, v10, 8
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v10, v10, a0
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v10
-; RV64ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v9
 ; RV64ZVBC32-NEXT:    ret
-  %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
-  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
-  %v = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb)
-  ret <vscale x 8 x i16> %v
-}
-
-define <vscale x 16 x i16> @clmul_nxv16i16_vv(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv16i16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vand.vi v16, v12, 2
-; CHECK-NEXT:    vand.vi v20, v12, 1
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v20, v16
-; CHECK-NEXT:    vand.vi v20, v12, 4
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vi v20, v12, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vmul.vv v8, v8, v12
-; CHECK-NEXT:    vxor.vv v8, v16, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 16 x i16> @llvm.clmul.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb)
-  ret <vscale x 16 x i16> %v
-}
-
-define <vscale x 16 x i16> @clmul_nxv16i16_vx(<vscale x 16 x i16> %va, i16 %b) nounwind {
-; CHECK-LABEL: clmul_nxv16i16_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v16, v12, 2
-; CHECK-NEXT:    vand.vi v20, v12, 1
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v20, v16
-; CHECK-NEXT:    vand.vi v20, v12, 4
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vi v20, v12, 8
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vmul.vv v8, v8, v12
-; CHECK-NEXT:    vxor.vv v8, v16, v8
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
-  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
-  %v = call <vscale x 16 x i16> @llvm.clmul.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb)
-  ret <vscale x 16 x i16> %v
-}
-
-define <vscale x 32 x i16> @clmul_nxv32i16_vv(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv32i16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vand.vi v24, v16, 2
-; CHECK-NEXT:    vand.vi v0, v16, 1
-; CHECK-NEXT:    vmul.vv v24, v8, v24
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v0, v24
-; CHECK-NEXT:    vand.vi v0, v16, 4
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vi v0, v16, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    vxor.vv v8, v24, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 32 x i16> @llvm.clmul.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb)
-  ret <vscale x 32 x i16> %v
-}
-
-define <vscale x 32 x i16> @clmul_nxv32i16_vx(<vscale x 32 x i16> %va, i16 %b) nounwind {
-; CHECK-LABEL: clmul_nxv32i16_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v24, v16, 2
-; CHECK-NEXT:    vand.vi v0, v16, 1
-; CHECK-NEXT:    vmul.vv v24, v8, v24
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v0, v24
-; CHECK-NEXT:    vand.vi v0, v16, 4
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vi v0, v16, 8
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    vxor.vv v8, v24, v8
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
-  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
-  %v = call <vscale x 32 x i16> @llvm.clmul.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb)
-  ret <vscale x 32 x i16> %v
+  %v = call <vscale x 2 x i16> @llvm.clmul.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb)
+  ret <vscale x 2 x i16> %v
 }
 
-define <vscale x 1 x i32> @clmul_nxv1i32_vv(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb) nounwind {
-; RV32V-LABEL: clmul_nxv1i32_vv:
+define <vscale x 2 x i16> @clmul_nxv2i16_vx(<vscale x 2 x i16> %va, i16 %b) nounwind {
+; RV32V-LABEL: clmul_nxv2i16_vx:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV32V-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    li a0, 16
 ; RV32V-NEXT:    vand.vi v10, v9, 2
 ; RV32V-NEXT:    vand.vi v11, v9, 1
 ; RV32V-NEXT:    vmul.vv v10, v8, v10
@@ -4190,7 +2317,6 @@ define <vscale x 1 x i32> @clmul_nxv1i32_vv(<vscale x 1 x i32> %va, <vscale x 1
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vi v11, v9, 8
-; RV32V-NEXT:    li a0, 16
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vx v11, v9, a0
@@ -4236,70 +2362,183 @@ define <vscale x 1 x i32> @clmul_nxv1i32_vv(<vscale x 1 x i32> %va, <vscale x 1
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vx v11, v9, a0
 ; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v9, v9, a0
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 16
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 32
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 64
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv2i16_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmul_nxv2i16_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf4 v8, v12
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv2i16_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf4 v8, v12
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv2i16_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv2i16_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.clmul.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb)
+  ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 4 x i16> @clmul_nxv4i16_vv(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv4i16_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 128
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 256
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    li a0, 16
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 512
+; RV32V-NEXT:    li a0, 32
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 1024
+; RV32V-NEXT:    li a0, 64
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 2048
+; RV32V-NEXT:    li a0, 128
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 4096
+; RV32V-NEXT:    li a0, 256
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 8192
+; RV32V-NEXT:    li a0, 512
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 16384
+; RV32V-NEXT:    li a0, 1024
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 32768
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 65536
+; RV32V-NEXT:    lui a0, 1
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 131072
+; RV32V-NEXT:    lui a0, 2
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    lui a0, 4
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    lui a0, 8
 ; RV32V-NEXT:    vand.vx v9, v9, a0
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v10, v10, v11
@@ -4307,9 +2546,9 @@ define <vscale x 1 x i32> @clmul_nxv1i32_vv(<vscale x 1 x i32> %va, <vscale x 1
 ; RV32V-NEXT:    vxor.vv v8, v10, v8
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmul_nxv1i32_vv:
+; RV64V-LABEL: clmul_nxv4i16_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV64V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; RV64V-NEXT:    vand.vi v10, v9, 2
 ; RV64V-NEXT:    vand.vi v11, v9, 1
 ; RV64V-NEXT:    vmul.vv v10, v8, v10
@@ -4365,70 +2604,178 @@ define <vscale x 1 x i32> @clmul_nxv1i32_vv(<vscale x 1 x i32> %va, <vscale x 1
 ; RV64V-NEXT:    vxor.vv v10, v10, v11
 ; RV64V-NEXT:    vand.vx v11, v9, a0
 ; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v9, v9, a0
 ; RV64V-NEXT:    vmul.vv v11, v8, v11
 ; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 16
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 32
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 64
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 128
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 256
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 512
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 1024
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 2048
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmul_nxv4i16_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v12, v9
+; RV32ZVBC64-NEXT:    vzext.vf4 v16, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v16, v12
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv4i16_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v12, v9
+; RV64ZVBC64-NEXT:    vzext.vf4 v16, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v16, v12
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv4i16_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv4i16_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    ret
+  %v = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb)
+  ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @clmul_nxv4i16_vx(<vscale x 4 x i16> %va, i16 %b) nounwind {
+; RV32V-LABEL: clmul_nxv4i16_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv4i16_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
 ; RV64V-NEXT:    vmul.vv v11, v8, v11
 ; RV64V-NEXT:    vxor.vv v10, v10, v11
 ; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    li a0, 32
 ; RV64V-NEXT:    vmul.vv v11, v8, v11
 ; RV64V-NEXT:    vxor.vv v10, v10, v11
 ; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    li a0, 64
 ; RV64V-NEXT:    vmul.vv v11, v8, v11
 ; RV64V-NEXT:    vxor.vv v10, v10, v11
 ; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    li a0, 128
 ; RV64V-NEXT:    vmul.vv v11, v8, v11
 ; RV64V-NEXT:    vxor.vv v10, v10, v11
 ; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    li a0, 256
 ; RV64V-NEXT:    vmul.vv v11, v8, v11
 ; RV64V-NEXT:    vxor.vv v10, v10, v11
 ; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    li a0, 512
 ; RV64V-NEXT:    vmul.vv v11, v8, v11
 ; RV64V-NEXT:    vxor.vv v10, v10, v11
 ; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    li a0, 1024
 ; RV64V-NEXT:    vmul.vv v11, v8, v11
 ; RV64V-NEXT:    vxor.vv v10, v10, v11
 ; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
 ; RV64V-NEXT:    vand.vx v9, v9, a0
 ; RV64V-NEXT:    vmul.vv v11, v8, v11
 ; RV64V-NEXT:    vxor.vv v10, v10, v11
@@ -4436,3619 +2783,4473 @@ define <vscale x 1 x i32> @clmul_nxv1i32_vv(<vscale x 1 x i32> %va, <vscale x 1
 ; RV64V-NEXT:    vxor.vv v8, v10, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv1i32_vv:
+; RV32ZVBC64-LABEL: clmul_nxv4i16_vx:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf2 v10, v9
-; RV32ZVBC64-NEXT:    vzext.vf2 v9, v8
-; RV32ZVBC64-NEXT:    vclmul.vv v8, v9, v10
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v12, v8
+; RV32ZVBC64-NEXT:    vzext.vf4 v8, v16
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v12, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmul_nxv1i32_vv:
+; RV64ZVBC64-LABEL: clmul_nxv4i16_vx:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf2 v10, v9
-; RV64ZVBC64-NEXT:    vzext.vf2 v9, v8
-; RV64ZVBC64-NEXT:    vclmul.vv v8, v9, v10
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v12, v8
+; RV64ZVBC64-NEXT:    vzext.vf4 v8, v16
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v12, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv1i32_vv:
+; RV32ZVBC32-LABEL: clmul_nxv4i16_vx:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 2048
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 4096
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 8192
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 16384
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 32768
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 65536
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 131072
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 262144
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 524288
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv1i32_vv:
+; RV64ZVBC32-LABEL: clmul_nxv4i16_vx:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 262144
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 524288
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
-  %v = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb)
-  ret <vscale x 1 x i32> %v
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb)
+  ret <vscale x 4 x i16> %v
 }
 
-define <vscale x 1 x i32> @clmul_nxv1i32_vx(<vscale x 1 x i32> %va, i32 %b) nounwind {
-; RV32V-LABEL: clmul_nxv1i32_vx:
+define <vscale x 8 x i16> @clmul_nxv8i16_vv(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv8i16_vv:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -32
-; RV32V-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    andi s6, a0, 2
-; RV32V-NEXT:    andi s5, a0, 1
-; RV32V-NEXT:    andi s3, a0, 4
-; RV32V-NEXT:    andi s1, a0, 8
-; RV32V-NEXT:    andi t6, a0, 16
-; RV32V-NEXT:    andi t4, a0, 32
-; RV32V-NEXT:    andi t2, a0, 64
-; RV32V-NEXT:    andi t0, a0, 128
-; RV32V-NEXT:    andi a6, a0, 256
-; RV32V-NEXT:    andi a4, a0, 512
-; RV32V-NEXT:    andi a2, a0, 1024
-; RV32V-NEXT:    li a1, 1
-; RV32V-NEXT:    lui a3, 1
-; RV32V-NEXT:    lui a5, 2
-; RV32V-NEXT:    lui a7, 4
-; RV32V-NEXT:    lui t1, 8
-; RV32V-NEXT:    lui t3, 16
-; RV32V-NEXT:    lui t5, 32
-; RV32V-NEXT:    lui s0, 64
-; RV32V-NEXT:    lui s2, 128
-; RV32V-NEXT:    lui s4, 256
-; RV32V-NEXT:    vsetvli s7, zero, e32, mf2, ta, ma
-; RV32V-NEXT:    vmul.vx v9, v8, s6
-; RV32V-NEXT:    lui s6, 512
-; RV32V-NEXT:    vmul.vx v10, v8, s5
-; RV32V-NEXT:    lui s5, 1024
-; RV32V-NEXT:    vxor.vv v9, v10, v9
-; RV32V-NEXT:    vmul.vx v10, v8, s3
-; RV32V-NEXT:    lui s3, 2048
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s1
-; RV32V-NEXT:    lui s1, 4096
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t6
-; RV32V-NEXT:    lui t6, 8192
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t4
-; RV32V-NEXT:    lui t4, 16384
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t2
-; RV32V-NEXT:    lui t2, 32768
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t0
-; RV32V-NEXT:    lui t0, 65536
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a6
-; RV32V-NEXT:    lui a6, 131072
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a4
-; RV32V-NEXT:    lui a4, 262144
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a2
-; RV32V-NEXT:    lui a2, 524288
-; RV32V-NEXT:    slli a1, a1, 11
-; RV32V-NEXT:    and a3, a0, a3
-; RV32V-NEXT:    and a5, a0, a5
-; RV32V-NEXT:    and a7, a0, a7
-; RV32V-NEXT:    and t1, a0, t1
-; RV32V-NEXT:    and t3, a0, t3
-; RV32V-NEXT:    and t5, a0, t5
-; RV32V-NEXT:    and s0, a0, s0
-; RV32V-NEXT:    and s2, a0, s2
-; RV32V-NEXT:    and s4, a0, s4
-; RV32V-NEXT:    and s6, a0, s6
-; RV32V-NEXT:    and s5, a0, s5
-; RV32V-NEXT:    and s3, a0, s3
-; RV32V-NEXT:    and s1, a0, s1
-; RV32V-NEXT:    and t6, a0, t6
-; RV32V-NEXT:    and t4, a0, t4
-; RV32V-NEXT:    and t2, a0, t2
-; RV32V-NEXT:    and t0, a0, t0
-; RV32V-NEXT:    and a6, a0, a6
-; RV32V-NEXT:    and a4, a0, a4
-; RV32V-NEXT:    and a2, a0, a2
-; RV32V-NEXT:    and a0, a0, a1
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a0
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a3
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a5
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a7
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t1
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t3
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t5
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s0
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s2
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s4
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s6
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s5
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s3
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s1
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t6
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t4
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t2
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t0
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a6
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a4
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v8, v8, a2
-; RV32V-NEXT:    vxor.vv v8, v9, v8
-; RV32V-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 32
-; RV32V-NEXT:    ret
-;
-; RV64V-LABEL: clmul_nxv1i32_vx:
-; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; RV64V-NEXT:    vmv.v.x v9, a0
-; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vand.vi v10, v9, 2
-; RV64V-NEXT:    vand.vi v11, v9, 1
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v11, v10
-; RV64V-NEXT:    vand.vi v11, v9, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vi v11, v9, 8
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; RV32V-NEXT:    vand.vi v12, v10, 2
+; RV32V-NEXT:    vand.vi v14, v10, 1
+; RV32V-NEXT:    vmul.vv v12, v8, v12
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v14, v12
+; RV32V-NEXT:    vand.vi v14, v10, 4
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vi v14, v10, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v10, v10, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vmul.vv v8, v8, v10
+; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv8i16_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; RV64V-NEXT:    vand.vi v12, v10, 2
+; RV64V-NEXT:    vand.vi v14, v10, 1
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v14, v12
+; RV64V-NEXT:    vand.vi v14, v10, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vi v14, v10, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 1
 ; RV64V-NEXT:    slli a0, a0, 11
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    lui a0, 1
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    lui a0, 2
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    lui a0, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    lui a0, 8
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 16
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 32
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 64
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 128
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 256
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 512
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 1024
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 2048
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 4096
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 8192
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 16384
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 32768
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 65536
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 131072
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 262144
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 524288
-; RV64V-NEXT:    vand.vx v9, v9, a0
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vmul.vv v8, v8, v9
-; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    vand.vx v10, v10, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vv v8, v8, v10
+; RV64V-NEXT:    vxor.vv v8, v12, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv1i32_vx:
+; RV32ZVBC64-LABEL: clmul_nxv8i16_vv:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; RV32ZVBC64-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC64-NEXT:    vzext.vf2 v8, v9
-; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v16, v10
+; RV32ZVBC64-NEXT:    vzext.vf4 v24, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v24, v16
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmul_nxv1i32_vx:
+; RV64ZVBC64-LABEL: clmul_nxv8i16_vv:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; RV64ZVBC64-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC64-NEXT:    vzext.vf2 v8, v9
-; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v16, v10
+; RV64ZVBC64-NEXT:    vzext.vf4 v24, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v24, v16
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv1i32_vx:
+; RV32ZVBC32-LABEL: clmul_nxv8i16_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    addi sp, sp, -32
-; RV32ZVBC32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    andi s6, a0, 2
-; RV32ZVBC32-NEXT:    andi s5, a0, 1
-; RV32ZVBC32-NEXT:    andi s3, a0, 4
-; RV32ZVBC32-NEXT:    andi s1, a0, 8
-; RV32ZVBC32-NEXT:    andi t6, a0, 16
-; RV32ZVBC32-NEXT:    andi t4, a0, 32
-; RV32ZVBC32-NEXT:    andi t2, a0, 64
-; RV32ZVBC32-NEXT:    andi t0, a0, 128
-; RV32ZVBC32-NEXT:    andi a6, a0, 256
-; RV32ZVBC32-NEXT:    andi a4, a0, 512
-; RV32ZVBC32-NEXT:    andi a2, a0, 1024
-; RV32ZVBC32-NEXT:    li a1, 1
-; RV32ZVBC32-NEXT:    lui a3, 1
-; RV32ZVBC32-NEXT:    lui a5, 2
-; RV32ZVBC32-NEXT:    lui a7, 4
-; RV32ZVBC32-NEXT:    lui t1, 8
-; RV32ZVBC32-NEXT:    lui t3, 16
-; RV32ZVBC32-NEXT:    lui t5, 32
-; RV32ZVBC32-NEXT:    lui s0, 64
-; RV32ZVBC32-NEXT:    lui s2, 128
-; RV32ZVBC32-NEXT:    lui s4, 256
-; RV32ZVBC32-NEXT:    vsetvli s7, zero, e32, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vmul.vx v9, v8, s6
-; RV32ZVBC32-NEXT:    lui s6, 512
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s5
-; RV32ZVBC32-NEXT:    lui s5, 1024
-; RV32ZVBC32-NEXT:    vxor.vv v9, v10, v9
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s3
-; RV32ZVBC32-NEXT:    lui s3, 2048
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s1
-; RV32ZVBC32-NEXT:    lui s1, 4096
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t6
-; RV32ZVBC32-NEXT:    lui t6, 8192
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t4
-; RV32ZVBC32-NEXT:    lui t4, 16384
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t2
-; RV32ZVBC32-NEXT:    lui t2, 32768
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t0
-; RV32ZVBC32-NEXT:    lui t0, 65536
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a6
-; RV32ZVBC32-NEXT:    lui a6, 131072
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a4
-; RV32ZVBC32-NEXT:    lui a4, 262144
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 524288
-; RV32ZVBC32-NEXT:    slli a1, a1, 11
-; RV32ZVBC32-NEXT:    and a3, a0, a3
-; RV32ZVBC32-NEXT:    and a5, a0, a5
-; RV32ZVBC32-NEXT:    and a7, a0, a7
-; RV32ZVBC32-NEXT:    and t1, a0, t1
-; RV32ZVBC32-NEXT:    and t3, a0, t3
-; RV32ZVBC32-NEXT:    and t5, a0, t5
-; RV32ZVBC32-NEXT:    and s0, a0, s0
-; RV32ZVBC32-NEXT:    and s2, a0, s2
-; RV32ZVBC32-NEXT:    and s4, a0, s4
-; RV32ZVBC32-NEXT:    and s6, a0, s6
-; RV32ZVBC32-NEXT:    and s5, a0, s5
-; RV32ZVBC32-NEXT:    and s3, a0, s3
-; RV32ZVBC32-NEXT:    and s1, a0, s1
-; RV32ZVBC32-NEXT:    and t6, a0, t6
-; RV32ZVBC32-NEXT:    and t4, a0, t4
-; RV32ZVBC32-NEXT:    and t2, a0, t2
-; RV32ZVBC32-NEXT:    and t0, a0, t0
-; RV32ZVBC32-NEXT:    and a6, a0, a6
-; RV32ZVBC32-NEXT:    and a4, a0, a4
-; RV32ZVBC32-NEXT:    and a2, a0, a2
-; RV32ZVBC32-NEXT:    and a0, a0, a1
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a0
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a3
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a5
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a7
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t1
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t3
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t5
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s0
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s2
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s4
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s6
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s5
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s3
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s1
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t6
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t4
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t2
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t0
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a6
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a4
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v8, v8, a2
-; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV32ZVBC32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    addi sp, sp, 32
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v10
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv1i32_vx:
+; RV64ZVBC32-LABEL: clmul_nxv8i16_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 262144
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 524288
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v10
 ; RV64ZVBC32-NEXT:    ret
-  %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
-  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
-  %v = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb)
-  ret <vscale x 1 x i32> %v
+  %v = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb)
+  ret <vscale x 8 x i16> %v
 }
 
-define <vscale x 2 x i32> @clmul_nxv2i32_vv(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb) nounwind {
-; RV32V-LABEL: clmul_nxv2i32_vv:
+define <vscale x 8 x i16> @clmul_nxv8i16_vx(<vscale x 8 x i16> %va, i16 %b) nounwind {
+; RV32V-LABEL: clmul_nxv8i16_vx:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; RV32V-NEXT:    vand.vi v10, v9, 2
-; RV32V-NEXT:    vand.vi v11, v9, 1
-; RV32V-NEXT:    vmul.vv v10, v8, v10
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v11, v10
-; RV32V-NEXT:    vand.vi v11, v9, 4
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV32V-NEXT:    vmv.v.x v10, a0
 ; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vand.vi v12, v10, 2
+; RV32V-NEXT:    vand.vi v14, v10, 1
+; RV32V-NEXT:    vmul.vv v12, v8, v12
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v14, v12
+; RV32V-NEXT:    vand.vi v14, v10, 4
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vi v14, v10, 8
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    li a0, 1
 ; RV32V-NEXT:    slli a0, a0, 11
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    lui a0, 4
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
 ; RV32V-NEXT:    lui a0, 8
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 16
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 32
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 64
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 128
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 256
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 512
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 1024
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 2048
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 4096
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 8192
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 16384
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 32768
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 65536
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 131072
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v9, a0
-; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    vand.vx v9, v9, a0
-; RV32V-NEXT:    vmul.vv v11, v8, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vmul.vv v8, v8, v9
-; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    vand.vx v10, v10, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vmul.vv v8, v8, v10
+; RV32V-NEXT:    vxor.vv v8, v12, v8
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmul_nxv2i32_vv:
+; RV64V-LABEL: clmul_nxv8i16_vx:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; RV64V-NEXT:    vand.vi v10, v9, 2
-; RV64V-NEXT:    vand.vi v11, v9, 1
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v11, v10
-; RV64V-NEXT:    vand.vi v11, v9, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV64V-NEXT:    vmv.v.x v10, a0
 ; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vand.vi v12, v10, 2
+; RV64V-NEXT:    vand.vi v14, v10, 1
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v14, v12
+; RV64V-NEXT:    vand.vi v14, v10, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vi v14, v10, 8
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    li a0, 1
 ; RV64V-NEXT:    slli a0, a0, 11
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    lui a0, 1
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    lui a0, 2
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    lui a0, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
 ; RV64V-NEXT:    lui a0, 8
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 16
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 32
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 64
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 128
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 256
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 512
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 1024
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 2048
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 4096
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 8192
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 16384
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 32768
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 65536
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 131072
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 262144
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 524288
-; RV64V-NEXT:    vand.vx v9, v9, a0
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vmul.vv v8, v8, v9
-; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    vand.vx v10, v10, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vv v8, v8, v10
+; RV64V-NEXT:    vxor.vv v8, v12, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv2i32_vv:
+; RV32ZVBC64-LABEL: clmul_nxv8i16_vx:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf2 v10, v9
-; RV32ZVBC64-NEXT:    vzext.vf2 v12, v8
-; RV32ZVBC64-NEXT:    vclmul.vv v10, v12, v10
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v16, v8
+; RV32ZVBC64-NEXT:    vzext.vf4 v8, v24
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmul_nxv2i32_vv:
+; RV64ZVBC64-LABEL: clmul_nxv8i16_vx:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf2 v10, v9
-; RV64ZVBC64-NEXT:    vzext.vf2 v12, v8
-; RV64ZVBC64-NEXT:    vclmul.vv v10, v12, v10
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v16, v8
+; RV64ZVBC64-NEXT:    vzext.vf4 v8, v24
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv2i32_vv:
+; RV32ZVBC32-LABEL: clmul_nxv8i16_vx:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; RV32ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 2048
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 4096
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 8192
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 16384
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 32768
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 65536
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 131072
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 262144
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV32ZVBC32-NEXT:    lui a0, 524288
-; RV32ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv2i32_vv:
+; RV64ZVBC32-LABEL: clmul_nxv8i16_vx:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 262144
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 524288
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
-  %v = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb)
-  ret <vscale x 2 x i32> %v
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb)
+  ret <vscale x 8 x i16> %v
 }
 
-define <vscale x 2 x i32> @clmul_nxv2i32_vx(<vscale x 2 x i32> %va, i32 %b) nounwind {
-; RV32V-LABEL: clmul_nxv2i32_vx:
+define <vscale x 16 x i16> @clmul_nxv16i16_vv(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv16i16_vv:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -32
-; RV32V-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    andi s6, a0, 2
-; RV32V-NEXT:    andi s5, a0, 1
-; RV32V-NEXT:    andi s3, a0, 4
-; RV32V-NEXT:    andi s1, a0, 8
-; RV32V-NEXT:    andi t6, a0, 16
-; RV32V-NEXT:    andi t4, a0, 32
-; RV32V-NEXT:    andi t2, a0, 64
-; RV32V-NEXT:    andi t0, a0, 128
-; RV32V-NEXT:    andi a6, a0, 256
-; RV32V-NEXT:    andi a4, a0, 512
-; RV32V-NEXT:    andi a2, a0, 1024
-; RV32V-NEXT:    li a1, 1
-; RV32V-NEXT:    lui a3, 1
-; RV32V-NEXT:    lui a5, 2
-; RV32V-NEXT:    lui a7, 4
-; RV32V-NEXT:    lui t1, 8
-; RV32V-NEXT:    lui t3, 16
-; RV32V-NEXT:    lui t5, 32
-; RV32V-NEXT:    lui s0, 64
-; RV32V-NEXT:    lui s2, 128
-; RV32V-NEXT:    lui s4, 256
-; RV32V-NEXT:    vsetvli s7, zero, e32, m1, ta, ma
-; RV32V-NEXT:    vmul.vx v9, v8, s6
-; RV32V-NEXT:    lui s6, 512
-; RV32V-NEXT:    vmul.vx v10, v8, s5
-; RV32V-NEXT:    lui s5, 1024
-; RV32V-NEXT:    vxor.vv v9, v10, v9
-; RV32V-NEXT:    vmul.vx v10, v8, s3
-; RV32V-NEXT:    lui s3, 2048
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s1
-; RV32V-NEXT:    lui s1, 4096
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t6
-; RV32V-NEXT:    lui t6, 8192
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t4
-; RV32V-NEXT:    lui t4, 16384
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t2
-; RV32V-NEXT:    lui t2, 32768
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t0
-; RV32V-NEXT:    lui t0, 65536
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a6
-; RV32V-NEXT:    lui a6, 131072
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a4
-; RV32V-NEXT:    lui a4, 262144
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a2
-; RV32V-NEXT:    lui a2, 524288
-; RV32V-NEXT:    slli a1, a1, 11
-; RV32V-NEXT:    and a3, a0, a3
-; RV32V-NEXT:    and a5, a0, a5
-; RV32V-NEXT:    and a7, a0, a7
-; RV32V-NEXT:    and t1, a0, t1
-; RV32V-NEXT:    and t3, a0, t3
-; RV32V-NEXT:    and t5, a0, t5
-; RV32V-NEXT:    and s0, a0, s0
-; RV32V-NEXT:    and s2, a0, s2
-; RV32V-NEXT:    and s4, a0, s4
-; RV32V-NEXT:    and s6, a0, s6
-; RV32V-NEXT:    and s5, a0, s5
-; RV32V-NEXT:    and s3, a0, s3
-; RV32V-NEXT:    and s1, a0, s1
-; RV32V-NEXT:    and t6, a0, t6
-; RV32V-NEXT:    and t4, a0, t4
-; RV32V-NEXT:    and t2, a0, t2
-; RV32V-NEXT:    and t0, a0, t0
-; RV32V-NEXT:    and a6, a0, a6
-; RV32V-NEXT:    and a4, a0, a4
-; RV32V-NEXT:    and a2, a0, a2
-; RV32V-NEXT:    and a0, a0, a1
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a0
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a3
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a5
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a7
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t1
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t3
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t5
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s0
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s2
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s4
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s6
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s5
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s3
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, s1
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t6
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t4
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t2
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, t0
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a6
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v10, v8, a4
-; RV32V-NEXT:    vxor.vv v9, v9, v10
-; RV32V-NEXT:    vmul.vx v8, v8, a2
-; RV32V-NEXT:    vxor.vv v8, v9, v8
-; RV32V-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 32
+; RV32V-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; RV32V-NEXT:    vand.vi v16, v12, 2
+; RV32V-NEXT:    vand.vi v20, v12, 1
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v20, v16
+; RV32V-NEXT:    vand.vi v20, v12, 4
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vi v20, v12, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v12, v12, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vmul.vv v8, v8, v12
+; RV32V-NEXT:    vxor.vv v8, v16, v8
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmul_nxv2i32_vx:
+; RV64V-LABEL: clmul_nxv16i16_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; RV64V-NEXT:    vand.vi v16, v12, 2
+; RV64V-NEXT:    vand.vi v20, v12, 1
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v20, v16
+; RV64V-NEXT:    vand.vi v20, v12, 4
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vi v20, v12, 8
 ; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vand.vi v10, v9, 2
-; RV64V-NEXT:    vand.vi v11, v9, 1
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v11, v10
-; RV64V-NEXT:    vand.vi v11, v9, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vi v11, v9, 8
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 1
 ; RV64V-NEXT:    slli a0, a0, 11
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    lui a0, 1
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    lui a0, 2
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    lui a0, 4
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    lui a0, 8
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 16
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 32
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 64
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 128
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 256
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 512
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 1024
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 2048
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 4096
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 8192
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 16384
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 32768
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 65536
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 131072
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 262144
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v9, a0
-; RV64V-NEXT:    lui a0, 524288
-; RV64V-NEXT:    vand.vx v9, v9, a0
-; RV64V-NEXT:    vmul.vv v11, v8, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vmul.vv v8, v8, v9
-; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    vand.vx v12, v12, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vv v8, v8, v12
+; RV64V-NEXT:    vxor.vv v8, v16, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv2i32_vx:
+; RV32ZVBC64-LABEL: clmul_nxv16i16_vv:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32ZVBC64-NEXT:    vmv.v.x v12, a0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC64-NEXT:    vzext.vf2 v8, v12
-; RV32ZVBC64-NEXT:    vclmul.vv v10, v10, v8
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; RV32ZVBC64-NEXT:    vand.vi v16, v12, 2
+; RV32ZVBC64-NEXT:    vand.vi v20, v12, 1
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v20, v16
+; RV32ZVBC64-NEXT:    vand.vi v20, v12, 4
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vi v20, v12, 8
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 256
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 512
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 1024
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 1
+; RV32ZVBC64-NEXT:    slli a0, a0, 11
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    lui a0, 1
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    lui a0, 2
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    lui a0, 4
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    lui a0, 8
+; RV32ZVBC64-NEXT:    vand.vx v12, v12, a0
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v12
+; RV32ZVBC64-NEXT:    vxor.vv v8, v16, v8
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmul_nxv2i32_vx:
+; RV64ZVBC64-LABEL: clmul_nxv16i16_vv:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV64ZVBC64-NEXT:    vmv.v.x v12, a0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC64-NEXT:    vzext.vf2 v8, v12
-; RV64ZVBC64-NEXT:    vclmul.vv v10, v10, v8
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; RV64ZVBC64-NEXT:    vand.vi v16, v12, 2
+; RV64ZVBC64-NEXT:    vand.vi v20, v12, 1
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v20, v16
+; RV64ZVBC64-NEXT:    vand.vi v20, v12, 4
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vi v20, v12, 8
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 256
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 512
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 1024
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 1
+; RV64ZVBC64-NEXT:    slli a0, a0, 11
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    lui a0, 1
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    lui a0, 2
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    lui a0, 4
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    lui a0, 8
+; RV64ZVBC64-NEXT:    vand.vx v12, v12, a0
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v12
+; RV64ZVBC64-NEXT:    vxor.vv v8, v16, v8
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv2i32_vx:
+; RV32ZVBC32-LABEL: clmul_nxv16i16_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    addi sp, sp, -32
-; RV32ZVBC32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    andi s6, a0, 2
-; RV32ZVBC32-NEXT:    andi s5, a0, 1
-; RV32ZVBC32-NEXT:    andi s3, a0, 4
-; RV32ZVBC32-NEXT:    andi s1, a0, 8
-; RV32ZVBC32-NEXT:    andi t6, a0, 16
-; RV32ZVBC32-NEXT:    andi t4, a0, 32
-; RV32ZVBC32-NEXT:    andi t2, a0, 64
-; RV32ZVBC32-NEXT:    andi t0, a0, 128
-; RV32ZVBC32-NEXT:    andi a6, a0, 256
-; RV32ZVBC32-NEXT:    andi a4, a0, 512
-; RV32ZVBC32-NEXT:    andi a2, a0, 1024
-; RV32ZVBC32-NEXT:    li a1, 1
-; RV32ZVBC32-NEXT:    lui a3, 1
-; RV32ZVBC32-NEXT:    lui a5, 2
-; RV32ZVBC32-NEXT:    lui a7, 4
-; RV32ZVBC32-NEXT:    lui t1, 8
-; RV32ZVBC32-NEXT:    lui t3, 16
-; RV32ZVBC32-NEXT:    lui t5, 32
-; RV32ZVBC32-NEXT:    lui s0, 64
-; RV32ZVBC32-NEXT:    lui s2, 128
-; RV32ZVBC32-NEXT:    lui s4, 256
-; RV32ZVBC32-NEXT:    vsetvli s7, zero, e32, m1, ta, ma
-; RV32ZVBC32-NEXT:    vmul.vx v9, v8, s6
-; RV32ZVBC32-NEXT:    lui s6, 512
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s5
-; RV32ZVBC32-NEXT:    lui s5, 1024
-; RV32ZVBC32-NEXT:    vxor.vv v9, v10, v9
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s3
-; RV32ZVBC32-NEXT:    lui s3, 2048
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s1
-; RV32ZVBC32-NEXT:    lui s1, 4096
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t6
-; RV32ZVBC32-NEXT:    lui t6, 8192
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t4
-; RV32ZVBC32-NEXT:    lui t4, 16384
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t2
-; RV32ZVBC32-NEXT:    lui t2, 32768
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t0
-; RV32ZVBC32-NEXT:    lui t0, 65536
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a6
-; RV32ZVBC32-NEXT:    lui a6, 131072
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a4
-; RV32ZVBC32-NEXT:    lui a4, 262144
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 524288
-; RV32ZVBC32-NEXT:    slli a1, a1, 11
-; RV32ZVBC32-NEXT:    and a3, a0, a3
-; RV32ZVBC32-NEXT:    and a5, a0, a5
-; RV32ZVBC32-NEXT:    and a7, a0, a7
-; RV32ZVBC32-NEXT:    and t1, a0, t1
-; RV32ZVBC32-NEXT:    and t3, a0, t3
-; RV32ZVBC32-NEXT:    and t5, a0, t5
-; RV32ZVBC32-NEXT:    and s0, a0, s0
-; RV32ZVBC32-NEXT:    and s2, a0, s2
-; RV32ZVBC32-NEXT:    and s4, a0, s4
-; RV32ZVBC32-NEXT:    and s6, a0, s6
-; RV32ZVBC32-NEXT:    and s5, a0, s5
-; RV32ZVBC32-NEXT:    and s3, a0, s3
-; RV32ZVBC32-NEXT:    and s1, a0, s1
-; RV32ZVBC32-NEXT:    and t6, a0, t6
-; RV32ZVBC32-NEXT:    and t4, a0, t4
-; RV32ZVBC32-NEXT:    and t2, a0, t2
-; RV32ZVBC32-NEXT:    and t0, a0, t0
-; RV32ZVBC32-NEXT:    and a6, a0, a6
-; RV32ZVBC32-NEXT:    and a4, a0, a4
-; RV32ZVBC32-NEXT:    and a2, a0, a2
-; RV32ZVBC32-NEXT:    and a0, a0, a1
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a0
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a3
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a5
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a7
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t1
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t3
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t5
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s0
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s2
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s4
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s6
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s5
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s3
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s1
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t6
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t4
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t2
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, t0
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a6
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, a4
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vx v8, v8, a2
-; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV32ZVBC32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    addi sp, sp, 32
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v12
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv2i32_vx:
+; RV64ZVBC32-LABEL: clmul_nxv16i16_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v10, v9, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v9, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 262144
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v9, a0
-; RV64ZVBC32-NEXT:    lui a0, 524288
-; RV64ZVBC32-NEXT:    vand.vx v9, v9, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v8, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v9
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v12
 ; RV64ZVBC32-NEXT:    ret
-  %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
-  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-  %v = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb)
-  ret <vscale x 2 x i32> %v
+  %v = call <vscale x 16 x i16> @llvm.clmul.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb)
+  ret <vscale x 16 x i16> %v
 }
 
-define <vscale x 4 x i32> @clmul_nxv4i32_vv(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb) nounwind {
-; RV32V-LABEL: clmul_nxv4i32_vv:
+define <vscale x 16 x i16> @clmul_nxv16i16_vx(<vscale x 16 x i16> %va, i16 %b) nounwind {
+; RV32V-LABEL: clmul_nxv16i16_vx:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV32V-NEXT:    vand.vi v12, v10, 2
-; RV32V-NEXT:    vand.vi v14, v10, 1
-; RV32V-NEXT:    vmul.vv v12, v8, v12
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v14, v12
-; RV32V-NEXT:    vand.vi v14, v10, 4
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vi v14, v10, 8
+; RV32V-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; RV32V-NEXT:    vmv.v.x v12, a0
 ; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vand.vi v16, v12, 2
+; RV32V-NEXT:    vand.vi v20, v12, 1
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v20, v16
+; RV32V-NEXT:    vand.vi v20, v12, 4
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vi v20, v12, 8
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    li a0, 1
 ; RV32V-NEXT:    slli a0, a0, 11
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
 ; RV32V-NEXT:    lui a0, 4
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 8
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 16
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 32
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 64
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 128
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 256
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 512
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 1024
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 2048
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 4096
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 8192
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 16384
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 32768
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 65536
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 131072
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v10, a0
-; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    vand.vx v10, v10, a0
-; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vmul.vv v8, v8, v10
-; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v12, v12, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vmul.vv v8, v8, v12
+; RV32V-NEXT:    vxor.vv v8, v16, v8
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmul_nxv4i32_vv:
+; RV64V-LABEL: clmul_nxv16i16_vx:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV64V-NEXT:    vand.vi v12, v10, 2
-; RV64V-NEXT:    vand.vi v14, v10, 1
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v14, v12
-; RV64V-NEXT:    vand.vi v14, v10, 4
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vi v14, v10, 8
+; RV64V-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; RV64V-NEXT:    vmv.v.x v12, a0
 ; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vand.vi v16, v12, 2
+; RV64V-NEXT:    vand.vi v20, v12, 1
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v20, v16
+; RV64V-NEXT:    vand.vi v20, v12, 4
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vi v20, v12, 8
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    li a0, 1
 ; RV64V-NEXT:    slli a0, a0, 11
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    lui a0, 1
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    lui a0, 2
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    lui a0, 4
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
 ; RV64V-NEXT:    lui a0, 8
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 16
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 32
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 64
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 128
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 256
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 512
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 1024
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 2048
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 4096
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 8192
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 16384
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 32768
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 65536
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 131072
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 262144
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 524288
-; RV64V-NEXT:    vand.vx v10, v10, a0
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vmul.vv v8, v8, v10
-; RV64V-NEXT:    vxor.vv v8, v12, v8
+; RV64V-NEXT:    vand.vx v12, v12, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vv v8, v8, v12
+; RV64V-NEXT:    vxor.vv v8, v16, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv4i32_vv:
+; RV32ZVBC64-LABEL: clmul_nxv16i16_vx:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf2 v12, v10
-; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
-; RV32ZVBC64-NEXT:    vclmul.vv v12, v16, v12
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vand.vi v16, v12, 2
+; RV32ZVBC64-NEXT:    vand.vi v20, v12, 1
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v20, v16
+; RV32ZVBC64-NEXT:    vand.vi v20, v12, 4
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vi v20, v12, 8
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 256
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 512
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 1024
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    li a0, 1
+; RV32ZVBC64-NEXT:    slli a0, a0, 11
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    lui a0, 1
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    lui a0, 2
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    lui a0, 4
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV32ZVBC64-NEXT:    lui a0, 8
+; RV32ZVBC64-NEXT:    vand.vx v12, v12, a0
+; RV32ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v12
+; RV32ZVBC64-NEXT:    vxor.vv v8, v16, v8
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmul_nxv4i32_vv:
+; RV64ZVBC64-LABEL: clmul_nxv16i16_vx:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf2 v12, v10
-; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
-; RV64ZVBC64-NEXT:    vclmul.vv v12, v16, v12
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vand.vi v16, v12, 2
+; RV64ZVBC64-NEXT:    vand.vi v20, v12, 1
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v20, v16
+; RV64ZVBC64-NEXT:    vand.vi v20, v12, 4
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vi v20, v12, 8
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 256
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 512
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 1024
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    li a0, 1
+; RV64ZVBC64-NEXT:    slli a0, a0, 11
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    lui a0, 1
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    lui a0, 2
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    lui a0, 4
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v12, a0
+; RV64ZVBC64-NEXT:    lui a0, 8
+; RV64ZVBC64-NEXT:    vand.vx v12, v12, a0
+; RV64ZVBC64-NEXT:    vmul.vv v20, v8, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v12
+; RV64ZVBC64-NEXT:    vxor.vv v8, v16, v8
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv4i32_vv:
+; RV32ZVBC32-LABEL: clmul_nxv16i16_vx:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV32ZVBC32-NEXT:    vand.vi v12, v10, 2
-; RV32ZVBC32-NEXT:    vand.vi v14, v10, 1
-; RV32ZVBC32-NEXT:    vmul.vv v12, v8, v12
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v14, v12
-; RV32ZVBC32-NEXT:    vand.vi v14, v10, 4
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vi v14, v10, 8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 2048
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 4096
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 8192
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 16384
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 32768
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 65536
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 131072
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 262144
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 524288
-; RV32ZVBC32-NEXT:    vand.vx v10, v10, a0
-; RV32ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v10
-; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv4i32_vv:
+; RV64ZVBC32-LABEL: clmul_nxv16i16_vx:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV64ZVBC32-NEXT:    vand.vi v12, v10, 2
-; RV64ZVBC32-NEXT:    vand.vi v14, v10, 1
-; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
-; RV64ZVBC32-NEXT:    vand.vi v14, v10, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vi v14, v10, 8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 262144
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 524288
-; RV64ZVBC32-NEXT:    vand.vx v10, v10, a0
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v10
-; RV64ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
-  %v = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb)
-  ret <vscale x 4 x i32> %v
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.clmul.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb)
+  ret <vscale x 16 x i16> %v
 }
 
-define <vscale x 4 x i32> @clmul_nxv4i32_vx(<vscale x 4 x i32> %va, i32 %b) nounwind {
-; RV32V-LABEL: clmul_nxv4i32_vx:
+define <vscale x 32 x i16> @clmul_nxv32i16_vv(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv32i16_vv:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -32
-; RV32V-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    andi s6, a0, 2
-; RV32V-NEXT:    andi s5, a0, 1
-; RV32V-NEXT:    andi s3, a0, 4
-; RV32V-NEXT:    andi s1, a0, 8
-; RV32V-NEXT:    andi t6, a0, 16
-; RV32V-NEXT:    andi t4, a0, 32
-; RV32V-NEXT:    andi t2, a0, 64
-; RV32V-NEXT:    andi t0, a0, 128
-; RV32V-NEXT:    andi a6, a0, 256
-; RV32V-NEXT:    andi a4, a0, 512
-; RV32V-NEXT:    andi a2, a0, 1024
-; RV32V-NEXT:    li a1, 1
-; RV32V-NEXT:    lui a3, 1
-; RV32V-NEXT:    lui a5, 2
-; RV32V-NEXT:    lui a7, 4
-; RV32V-NEXT:    lui t1, 8
-; RV32V-NEXT:    lui t3, 16
-; RV32V-NEXT:    lui t5, 32
-; RV32V-NEXT:    lui s0, 64
-; RV32V-NEXT:    lui s2, 128
-; RV32V-NEXT:    lui s4, 256
-; RV32V-NEXT:    vsetvli s7, zero, e32, m2, ta, ma
-; RV32V-NEXT:    vmul.vx v10, v8, s6
-; RV32V-NEXT:    lui s6, 512
-; RV32V-NEXT:    vmul.vx v12, v8, s5
-; RV32V-NEXT:    lui s5, 1024
-; RV32V-NEXT:    vxor.vv v10, v12, v10
-; RV32V-NEXT:    vmul.vx v12, v8, s3
-; RV32V-NEXT:    lui s3, 2048
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, s1
-; RV32V-NEXT:    lui s1, 4096
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, t6
-; RV32V-NEXT:    lui t6, 8192
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, t4
-; RV32V-NEXT:    lui t4, 16384
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, t2
-; RV32V-NEXT:    lui t2, 32768
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, t0
-; RV32V-NEXT:    lui t0, 65536
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, a6
-; RV32V-NEXT:    lui a6, 131072
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, a4
-; RV32V-NEXT:    lui a4, 262144
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 524288
-; RV32V-NEXT:    slli a1, a1, 11
-; RV32V-NEXT:    and a3, a0, a3
-; RV32V-NEXT:    and a5, a0, a5
-; RV32V-NEXT:    and a7, a0, a7
-; RV32V-NEXT:    and t1, a0, t1
-; RV32V-NEXT:    and t3, a0, t3
-; RV32V-NEXT:    and t5, a0, t5
-; RV32V-NEXT:    and s0, a0, s0
-; RV32V-NEXT:    and s2, a0, s2
-; RV32V-NEXT:    and s4, a0, s4
-; RV32V-NEXT:    and s6, a0, s6
-; RV32V-NEXT:    and s5, a0, s5
-; RV32V-NEXT:    and s3, a0, s3
-; RV32V-NEXT:    and s1, a0, s1
-; RV32V-NEXT:    and t6, a0, t6
-; RV32V-NEXT:    and t4, a0, t4
-; RV32V-NEXT:    and t2, a0, t2
-; RV32V-NEXT:    and t0, a0, t0
-; RV32V-NEXT:    and a6, a0, a6
-; RV32V-NEXT:    and a4, a0, a4
-; RV32V-NEXT:    and a2, a0, a2
-; RV32V-NEXT:    and a0, a0, a1
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, a0
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, a3
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, a5
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, a7
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, t1
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, t3
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, t5
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, s0
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, s2
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, s4
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, s6
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, s5
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, s3
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, s1
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, t6
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, t4
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, t2
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, t0
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, a6
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v12, v8, a4
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vmul.vx v8, v8, a2
-; RV32V-NEXT:    vxor.vv v8, v10, v8
-; RV32V-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 32
+; RV32V-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV32V-NEXT:    vand.vi v24, v16, 2
+; RV32V-NEXT:    vand.vi v0, v16, 1
+; RV32V-NEXT:    vmul.vv v24, v8, v24
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v0, v24
+; RV32V-NEXT:    vand.vi v0, v16, 4
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vi v0, v16, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vv v8, v8, v16
+; RV32V-NEXT:    vxor.vv v8, v24, v8
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmul_nxv4i32_vx:
+; RV64V-LABEL: clmul_nxv32i16_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV64V-NEXT:    vmv.v.x v10, a0
+; RV64V-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV64V-NEXT:    vand.vi v24, v16, 2
+; RV64V-NEXT:    vand.vi v0, v16, 1
+; RV64V-NEXT:    vmul.vv v24, v8, v24
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v0, v24
+; RV64V-NEXT:    vand.vi v0, v16, 4
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vi v0, v16, 8
 ; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vand.vi v12, v10, 2
-; RV64V-NEXT:    vand.vi v14, v10, 1
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v14, v12
-; RV64V-NEXT:    vand.vi v14, v10, 4
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vi v14, v10, 8
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 1
 ; RV64V-NEXT:    slli a0, a0, 11
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 1
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 2
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 4
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 8
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 16
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 32
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 64
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 128
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 256
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 512
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 1024
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 2048
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 4096
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 8192
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 16384
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 32768
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 65536
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 131072
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 262144
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v10, a0
-; RV64V-NEXT:    lui a0, 524288
-; RV64V-NEXT:    vand.vx v10, v10, a0
-; RV64V-NEXT:    vmul.vv v14, v8, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vmul.vv v8, v8, v10
-; RV64V-NEXT:    vxor.vv v8, v12, v8
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vv v8, v8, v16
+; RV64V-NEXT:    vxor.vv v8, v24, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv4i32_vx:
+; RV32ZVBC64-LABEL: clmul_nxv32i16_vv:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32ZVBC64-NEXT:    vmv.v.x v16, a0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf2 v12, v8
-; RV32ZVBC64-NEXT:    vzext.vf2 v8, v16
-; RV32ZVBC64-NEXT:    vclmul.vv v12, v12, v8
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV32ZVBC64-NEXT:    vand.vi v24, v16, 2
+; RV32ZVBC64-NEXT:    vand.vi v0, v16, 1
+; RV32ZVBC64-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV32ZVBC64-NEXT:    vand.vi v0, v16, 4
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vi v0, v16, 8
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 256
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 512
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 1024
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 1
+; RV32ZVBC64-NEXT:    slli a0, a0, 11
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 1
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 2
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 4
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 8
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    vxor.vv v8, v24, v8
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmul_nxv4i32_vx:
+; RV64ZVBC64-LABEL: clmul_nxv32i16_vv:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf2 v12, v8
-; RV64ZVBC64-NEXT:    vzext.vf2 v8, v16
-; RV64ZVBC64-NEXT:    vclmul.vv v12, v12, v8
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV64ZVBC64-NEXT:    vand.vi v24, v16, 2
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 1
+; RV64ZVBC64-NEXT:    vmul.vv v24, v8, v24
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 8
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 256
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 512
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 1024
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 1
+; RV64ZVBC64-NEXT:    slli a0, a0, 11
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 1
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 2
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 8
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    vxor.vv v8, v24, v8
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv4i32_vx:
+; RV32ZVBC32-LABEL: clmul_nxv32i16_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    addi sp, sp, -32
-; RV32ZVBC32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    andi s6, a0, 2
-; RV32ZVBC32-NEXT:    andi s5, a0, 1
-; RV32ZVBC32-NEXT:    andi s3, a0, 4
-; RV32ZVBC32-NEXT:    andi s1, a0, 8
-; RV32ZVBC32-NEXT:    andi t6, a0, 16
-; RV32ZVBC32-NEXT:    andi t4, a0, 32
-; RV32ZVBC32-NEXT:    andi t2, a0, 64
-; RV32ZVBC32-NEXT:    andi t0, a0, 128
-; RV32ZVBC32-NEXT:    andi a6, a0, 256
-; RV32ZVBC32-NEXT:    andi a4, a0, 512
-; RV32ZVBC32-NEXT:    andi a2, a0, 1024
-; RV32ZVBC32-NEXT:    li a1, 1
-; RV32ZVBC32-NEXT:    lui a3, 1
-; RV32ZVBC32-NEXT:    lui a5, 2
-; RV32ZVBC32-NEXT:    lui a7, 4
-; RV32ZVBC32-NEXT:    lui t1, 8
-; RV32ZVBC32-NEXT:    lui t3, 16
-; RV32ZVBC32-NEXT:    lui t5, 32
-; RV32ZVBC32-NEXT:    lui s0, 64
-; RV32ZVBC32-NEXT:    lui s2, 128
-; RV32ZVBC32-NEXT:    lui s4, 256
-; RV32ZVBC32-NEXT:    vsetvli s7, zero, e32, m2, ta, ma
-; RV32ZVBC32-NEXT:    vmul.vx v10, v8, s6
-; RV32ZVBC32-NEXT:    lui s6, 512
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s5
-; RV32ZVBC32-NEXT:    lui s5, 1024
-; RV32ZVBC32-NEXT:    vxor.vv v10, v12, v10
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s3
-; RV32ZVBC32-NEXT:    lui s3, 2048
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s1
-; RV32ZVBC32-NEXT:    lui s1, 4096
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t6
-; RV32ZVBC32-NEXT:    lui t6, 8192
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t4
-; RV32ZVBC32-NEXT:    lui t4, 16384
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t2
-; RV32ZVBC32-NEXT:    lui t2, 32768
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t0
-; RV32ZVBC32-NEXT:    lui t0, 65536
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a6
-; RV32ZVBC32-NEXT:    lui a6, 131072
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a4
-; RV32ZVBC32-NEXT:    lui a4, 262144
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 524288
-; RV32ZVBC32-NEXT:    slli a1, a1, 11
-; RV32ZVBC32-NEXT:    and a3, a0, a3
-; RV32ZVBC32-NEXT:    and a5, a0, a5
-; RV32ZVBC32-NEXT:    and a7, a0, a7
-; RV32ZVBC32-NEXT:    and t1, a0, t1
-; RV32ZVBC32-NEXT:    and t3, a0, t3
-; RV32ZVBC32-NEXT:    and t5, a0, t5
-; RV32ZVBC32-NEXT:    and s0, a0, s0
-; RV32ZVBC32-NEXT:    and s2, a0, s2
-; RV32ZVBC32-NEXT:    and s4, a0, s4
-; RV32ZVBC32-NEXT:    and s6, a0, s6
-; RV32ZVBC32-NEXT:    and s5, a0, s5
-; RV32ZVBC32-NEXT:    and s3, a0, s3
-; RV32ZVBC32-NEXT:    and s1, a0, s1
-; RV32ZVBC32-NEXT:    and t6, a0, t6
-; RV32ZVBC32-NEXT:    and t4, a0, t4
-; RV32ZVBC32-NEXT:    and t2, a0, t2
-; RV32ZVBC32-NEXT:    and t0, a0, t0
-; RV32ZVBC32-NEXT:    and a6, a0, a6
-; RV32ZVBC32-NEXT:    and a4, a0, a4
-; RV32ZVBC32-NEXT:    and a2, a0, a2
-; RV32ZVBC32-NEXT:    and a0, a0, a1
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a0
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a3
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a5
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a7
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t1
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t3
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t5
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s0
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s2
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s4
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s6
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s5
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s3
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s1
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t6
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t4
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t2
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, t0
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a6
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, a4
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vx v8, v8, a2
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    addi sp, sp, 32
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v16
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv4i32_vx:
+; RV64ZVBC32-LABEL: clmul_nxv32i16_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v10, a0
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v12, v10, 2
-; RV64ZVBC32-NEXT:    vand.vi v14, v10, 1
-; RV64ZVBC32-NEXT:    vmul.vv v12, v8, v12
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
-; RV64ZVBC32-NEXT:    vand.vi v14, v10, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vi v14, v10, 8
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 262144
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v10, a0
-; RV64ZVBC32-NEXT:    lui a0, 524288
-; RV64ZVBC32-NEXT:    vand.vx v10, v10, a0
-; RV64ZVBC32-NEXT:    vmul.vv v14, v8, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v10
-; RV64ZVBC32-NEXT:    vxor.vv v8, v12, v8
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v16
 ; RV64ZVBC32-NEXT:    ret
-  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
-  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-  %v = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb)
-  ret <vscale x 4 x i32> %v
+  %v = call <vscale x 32 x i16> @llvm.clmul.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb)
+  ret <vscale x 32 x i16> %v
 }
 
-define <vscale x 8 x i32> @clmul_nxv8i32_vv(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb) nounwind {
-; RV32V-LABEL: clmul_nxv8i32_vv:
+define <vscale x 32 x i16> @clmul_nxv32i16_vx(<vscale x 32 x i16> %va, i16 %b) nounwind {
+; RV32V-LABEL: clmul_nxv32i16_vx:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; RV32V-NEXT:    vand.vi v16, v12, 2
-; RV32V-NEXT:    vand.vi v20, v12, 1
-; RV32V-NEXT:    vmul.vv v16, v8, v16
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v20, v16
-; RV32V-NEXT:    vand.vi v20, v12, 4
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vi v20, v12, 8
+; RV32V-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; RV32V-NEXT:    vmv.v.x v16, a0
 ; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    vand.vi v24, v16, 2
+; RV32V-NEXT:    vand.vi v0, v16, 1
+; RV32V-NEXT:    vmul.vv v24, v8, v24
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v0, v24
+; RV32V-NEXT:    vand.vi v0, v16, 4
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vi v0, v16, 8
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
 ; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
 ; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
 ; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
 ; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
 ; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
 ; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
 ; RV32V-NEXT:    li a0, 1
 ; RV32V-NEXT:    slli a0, a0, 11
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
 ; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
 ; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
 ; RV32V-NEXT:    lui a0, 4
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 8
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 16
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 32
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 64
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 128
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 256
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 512
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 1024
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 2048
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 4096
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 8192
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 16384
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 32768
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 65536
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 131072
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v12, a0
-; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    vand.vx v12, v12, a0
-; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vmul.vv v8, v8, v12
-; RV32V-NEXT:    vxor.vv v8, v16, v8
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vv v8, v8, v16
+; RV32V-NEXT:    vxor.vv v8, v24, v8
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmul_nxv8i32_vv:
+; RV64V-LABEL: clmul_nxv32i16_vx:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; RV64V-NEXT:    vand.vi v16, v12, 2
-; RV64V-NEXT:    vand.vi v20, v12, 1
-; RV64V-NEXT:    vmul.vv v16, v8, v16
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v20, v16
-; RV64V-NEXT:    vand.vi v20, v12, 4
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vi v20, v12, 8
+; RV64V-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; RV64V-NEXT:    vmv.v.x v16, a0
 ; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vand.vi v24, v16, 2
+; RV64V-NEXT:    vand.vi v0, v16, 1
+; RV64V-NEXT:    vmul.vv v24, v8, v24
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v0, v24
+; RV64V-NEXT:    vand.vi v0, v16, 4
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vi v0, v16, 8
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 1
 ; RV64V-NEXT:    slli a0, a0, 11
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 1
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 2
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 4
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 8
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 16
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 32
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 64
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 128
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 256
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 512
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 1024
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 2048
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 4096
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 8192
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 16384
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 32768
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 65536
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 131072
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 262144
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
-; RV64V-NEXT:    lui a0, 524288
-; RV64V-NEXT:    vand.vx v12, v12, a0
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vmul.vv v8, v8, v12
-; RV64V-NEXT:    vxor.vv v8, v16, v8
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vv v8, v8, v16
+; RV64V-NEXT:    vxor.vv v8, v24, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv8i32_vv:
+; RV32ZVBC64-LABEL: clmul_nxv32i16_vx:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf2 v16, v12
-; RV32ZVBC64-NEXT:    vzext.vf2 v24, v8
-; RV32ZVBC64-NEXT:    vclmul.vv v16, v24, v16
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vand.vi v24, v16, 2
+; RV32ZVBC64-NEXT:    vand.vi v0, v16, 1
+; RV32ZVBC64-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV32ZVBC64-NEXT:    vand.vi v0, v16, 4
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vi v0, v16, 8
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 256
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 512
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 1024
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 1
+; RV32ZVBC64-NEXT:    slli a0, a0, 11
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 1
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 2
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 4
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 8
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    vxor.vv v8, v24, v8
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmul_nxv8i32_vv:
+; RV64ZVBC64-LABEL: clmul_nxv32i16_vx:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf2 v16, v12
-; RV64ZVBC64-NEXT:    vzext.vf2 v24, v8
-; RV64ZVBC64-NEXT:    vclmul.vv v16, v24, v16
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vand.vi v24, v16, 2
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 1
+; RV64ZVBC64-NEXT:    vmul.vv v24, v8, v24
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 8
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 256
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 512
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 1024
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 1
+; RV64ZVBC64-NEXT:    slli a0, a0, 11
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 1
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 2
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 8
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    vxor.vv v8, v24, v8
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv8i32_vv:
+; RV32ZVBC32-LABEL: clmul_nxv32i16_vx:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; RV32ZVBC32-NEXT:    vand.vi v16, v12, 2
-; RV32ZVBC32-NEXT:    vand.vi v20, v12, 1
-; RV32ZVBC32-NEXT:    vmul.vv v16, v8, v16
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v20, v16
-; RV32ZVBC32-NEXT:    vand.vi v20, v12, 4
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vi v20, v12, 8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 16
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 2048
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 4096
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 8192
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 16384
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 32768
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 65536
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 131072
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 262144
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 524288
-; RV32ZVBC32-NEXT:    vand.vx v12, v12, a0
-; RV32ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vmul.vv v8, v8, v12
-; RV32ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv8i32_vv:
+; RV64ZVBC32-LABEL: clmul_nxv32i16_vx:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; RV64ZVBC32-NEXT:    vand.vi v16, v12, 2
-; RV64ZVBC32-NEXT:    vand.vi v20, v12, 1
-; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v20, v16
-; RV64ZVBC32-NEXT:    vand.vi v20, v12, 4
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vi v20, v12, 8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 262144
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 524288
-; RV64ZVBC32-NEXT:    vand.vx v12, v12, a0
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v12
-; RV64ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.clmul.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 1 x i32> @clmul_nxv1i32_vv(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv1i32_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 16
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 128
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 256
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 512
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 2048
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 4096
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 8192
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 16384
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 32768
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 65536
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 131072
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv1i32_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmul_nxv1i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf2 v9, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv1i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf2 v9, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv1i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv1i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    ret
+  %v = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @clmul_nxv1i32_vx(<vscale x 1 x i32> %va, i32 %b) nounwind {
+; RV32V-LABEL: clmul_nxv1i32_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -32
+; RV32V-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    andi s6, a0, 2
+; RV32V-NEXT:    andi s5, a0, 1
+; RV32V-NEXT:    andi s3, a0, 4
+; RV32V-NEXT:    andi s1, a0, 8
+; RV32V-NEXT:    andi t6, a0, 16
+; RV32V-NEXT:    andi t4, a0, 32
+; RV32V-NEXT:    andi t2, a0, 64
+; RV32V-NEXT:    andi t0, a0, 128
+; RV32V-NEXT:    andi a6, a0, 256
+; RV32V-NEXT:    andi a4, a0, 512
+; RV32V-NEXT:    andi a2, a0, 1024
+; RV32V-NEXT:    li a1, 1
+; RV32V-NEXT:    lui a3, 1
+; RV32V-NEXT:    lui a5, 2
+; RV32V-NEXT:    lui a7, 4
+; RV32V-NEXT:    lui t1, 8
+; RV32V-NEXT:    lui t3, 16
+; RV32V-NEXT:    lui t5, 32
+; RV32V-NEXT:    lui s0, 64
+; RV32V-NEXT:    lui s2, 128
+; RV32V-NEXT:    lui s4, 256
+; RV32V-NEXT:    vsetvli s7, zero, e32, mf2, ta, ma
+; RV32V-NEXT:    vmul.vx v9, v8, s6
+; RV32V-NEXT:    lui s6, 512
+; RV32V-NEXT:    vmul.vx v10, v8, s5
+; RV32V-NEXT:    lui s5, 1024
+; RV32V-NEXT:    vxor.vv v9, v10, v9
+; RV32V-NEXT:    vmul.vx v10, v8, s3
+; RV32V-NEXT:    lui s3, 2048
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s1
+; RV32V-NEXT:    lui s1, 4096
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t6
+; RV32V-NEXT:    lui t6, 8192
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t4
+; RV32V-NEXT:    lui t4, 16384
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t2
+; RV32V-NEXT:    lui t2, 32768
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t0
+; RV32V-NEXT:    lui t0, 65536
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a6
+; RV32V-NEXT:    lui a6, 131072
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a4
+; RV32V-NEXT:    lui a4, 262144
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a2
+; RV32V-NEXT:    lui a2, 524288
+; RV32V-NEXT:    slli a1, a1, 11
+; RV32V-NEXT:    and a3, a0, a3
+; RV32V-NEXT:    and a5, a0, a5
+; RV32V-NEXT:    and a7, a0, a7
+; RV32V-NEXT:    and t1, a0, t1
+; RV32V-NEXT:    and t3, a0, t3
+; RV32V-NEXT:    and t5, a0, t5
+; RV32V-NEXT:    and s0, a0, s0
+; RV32V-NEXT:    and s2, a0, s2
+; RV32V-NEXT:    and s4, a0, s4
+; RV32V-NEXT:    and s6, a0, s6
+; RV32V-NEXT:    and s5, a0, s5
+; RV32V-NEXT:    and s3, a0, s3
+; RV32V-NEXT:    and s1, a0, s1
+; RV32V-NEXT:    and t6, a0, t6
+; RV32V-NEXT:    and t4, a0, t4
+; RV32V-NEXT:    and t2, a0, t2
+; RV32V-NEXT:    and t0, a0, t0
+; RV32V-NEXT:    and a6, a0, a6
+; RV32V-NEXT:    and a4, a0, a4
+; RV32V-NEXT:    and a2, a0, a2
+; RV32V-NEXT:    and a0, a0, a1
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a0
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a3
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a5
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a7
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t1
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t3
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t5
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s0
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s2
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s4
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s6
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s5
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s3
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s1
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t6
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t4
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t2
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t0
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a6
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a4
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v8, v8, a2
+; RV32V-NEXT:    vxor.vv v8, v9, v8
+; RV32V-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    addi sp, sp, 32
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv1i32_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmul_nxv1i32_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v9
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv1i32_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v9
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv1i32_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv1i32_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb)
+  ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 2 x i32> @clmul_nxv2i32_vv(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv2i32_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 16
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 128
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 256
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 512
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 2048
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 4096
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 8192
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 16384
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 32768
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 65536
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 131072
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv2i32_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmul_nxv2i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v10, v12, v10
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv2i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v10, v12, v10
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv2i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv2i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    ret
+  %v = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @clmul_nxv2i32_vx(<vscale x 2 x i32> %va, i32 %b) nounwind {
+; RV32V-LABEL: clmul_nxv2i32_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -32
+; RV32V-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    andi s6, a0, 2
+; RV32V-NEXT:    andi s5, a0, 1
+; RV32V-NEXT:    andi s3, a0, 4
+; RV32V-NEXT:    andi s1, a0, 8
+; RV32V-NEXT:    andi t6, a0, 16
+; RV32V-NEXT:    andi t4, a0, 32
+; RV32V-NEXT:    andi t2, a0, 64
+; RV32V-NEXT:    andi t0, a0, 128
+; RV32V-NEXT:    andi a6, a0, 256
+; RV32V-NEXT:    andi a4, a0, 512
+; RV32V-NEXT:    andi a2, a0, 1024
+; RV32V-NEXT:    li a1, 1
+; RV32V-NEXT:    lui a3, 1
+; RV32V-NEXT:    lui a5, 2
+; RV32V-NEXT:    lui a7, 4
+; RV32V-NEXT:    lui t1, 8
+; RV32V-NEXT:    lui t3, 16
+; RV32V-NEXT:    lui t5, 32
+; RV32V-NEXT:    lui s0, 64
+; RV32V-NEXT:    lui s2, 128
+; RV32V-NEXT:    lui s4, 256
+; RV32V-NEXT:    vsetvli s7, zero, e32, m1, ta, ma
+; RV32V-NEXT:    vmul.vx v9, v8, s6
+; RV32V-NEXT:    lui s6, 512
+; RV32V-NEXT:    vmul.vx v10, v8, s5
+; RV32V-NEXT:    lui s5, 1024
+; RV32V-NEXT:    vxor.vv v9, v10, v9
+; RV32V-NEXT:    vmul.vx v10, v8, s3
+; RV32V-NEXT:    lui s3, 2048
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s1
+; RV32V-NEXT:    lui s1, 4096
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t6
+; RV32V-NEXT:    lui t6, 8192
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t4
+; RV32V-NEXT:    lui t4, 16384
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t2
+; RV32V-NEXT:    lui t2, 32768
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t0
+; RV32V-NEXT:    lui t0, 65536
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a6
+; RV32V-NEXT:    lui a6, 131072
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a4
+; RV32V-NEXT:    lui a4, 262144
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a2
+; RV32V-NEXT:    lui a2, 524288
+; RV32V-NEXT:    slli a1, a1, 11
+; RV32V-NEXT:    and a3, a0, a3
+; RV32V-NEXT:    and a5, a0, a5
+; RV32V-NEXT:    and a7, a0, a7
+; RV32V-NEXT:    and t1, a0, t1
+; RV32V-NEXT:    and t3, a0, t3
+; RV32V-NEXT:    and t5, a0, t5
+; RV32V-NEXT:    and s0, a0, s0
+; RV32V-NEXT:    and s2, a0, s2
+; RV32V-NEXT:    and s4, a0, s4
+; RV32V-NEXT:    and s6, a0, s6
+; RV32V-NEXT:    and s5, a0, s5
+; RV32V-NEXT:    and s3, a0, s3
+; RV32V-NEXT:    and s1, a0, s1
+; RV32V-NEXT:    and t6, a0, t6
+; RV32V-NEXT:    and t4, a0, t4
+; RV32V-NEXT:    and t2, a0, t2
+; RV32V-NEXT:    and t0, a0, t0
+; RV32V-NEXT:    and a6, a0, a6
+; RV32V-NEXT:    and a4, a0, a4
+; RV32V-NEXT:    and a2, a0, a2
+; RV32V-NEXT:    and a0, a0, a1
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a0
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a3
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a5
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a7
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t1
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t3
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t5
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s0
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s2
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s4
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s6
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s5
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s3
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s1
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t6
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t4
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t2
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t0
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a6
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a4
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v8, v8, a2
+; RV32V-NEXT:    vxor.vv v8, v9, v8
+; RV32V-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    addi sp, sp, 32
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv2i32_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmul_nxv2i32_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v12
+; RV32ZVBC64-NEXT:    vclmul.vv v10, v10, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv2i32_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v12
+; RV64ZVBC64-NEXT:    vclmul.vv v10, v10, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv2i32_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv2i32_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 4 x i32> @clmul_nxv4i32_vv(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv4i32_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV32V-NEXT:    vand.vi v12, v10, 2
+; RV32V-NEXT:    vand.vi v14, v10, 1
+; RV32V-NEXT:    vmul.vv v12, v8, v12
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v14, v12
+; RV32V-NEXT:    vand.vi v14, v10, 4
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vi v14, v10, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 16
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 32
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 64
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 128
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 256
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 512
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 1024
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 2048
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 4096
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 8192
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 16384
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 32768
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 65536
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 131072
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vand.vx v10, v10, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vmul.vv v8, v8, v10
+; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv4i32_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV64V-NEXT:    vand.vi v12, v10, 2
+; RV64V-NEXT:    vand.vi v14, v10, 1
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v14, v12
+; RV64V-NEXT:    vand.vi v14, v10, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vi v14, v10, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v10, v10, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vv v8, v8, v10
+; RV64V-NEXT:    vxor.vv v8, v12, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmul_nxv4i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v12, v10
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v12, v16, v12
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv4i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v12, v10
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v12, v16, v12
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv4i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v10
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv4i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v10
+; RV64ZVBC32-NEXT:    ret
+  %v = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @clmul_nxv4i32_vx(<vscale x 4 x i32> %va, i32 %b) nounwind {
+; RV32V-LABEL: clmul_nxv4i32_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -32
+; RV32V-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    andi s6, a0, 2
+; RV32V-NEXT:    andi s5, a0, 1
+; RV32V-NEXT:    andi s3, a0, 4
+; RV32V-NEXT:    andi s1, a0, 8
+; RV32V-NEXT:    andi t6, a0, 16
+; RV32V-NEXT:    andi t4, a0, 32
+; RV32V-NEXT:    andi t2, a0, 64
+; RV32V-NEXT:    andi t0, a0, 128
+; RV32V-NEXT:    andi a6, a0, 256
+; RV32V-NEXT:    andi a4, a0, 512
+; RV32V-NEXT:    andi a2, a0, 1024
+; RV32V-NEXT:    li a1, 1
+; RV32V-NEXT:    lui a3, 1
+; RV32V-NEXT:    lui a5, 2
+; RV32V-NEXT:    lui a7, 4
+; RV32V-NEXT:    lui t1, 8
+; RV32V-NEXT:    lui t3, 16
+; RV32V-NEXT:    lui t5, 32
+; RV32V-NEXT:    lui s0, 64
+; RV32V-NEXT:    lui s2, 128
+; RV32V-NEXT:    lui s4, 256
+; RV32V-NEXT:    vsetvli s7, zero, e32, m2, ta, ma
+; RV32V-NEXT:    vmul.vx v10, v8, s6
+; RV32V-NEXT:    lui s6, 512
+; RV32V-NEXT:    vmul.vx v12, v8, s5
+; RV32V-NEXT:    lui s5, 1024
+; RV32V-NEXT:    vxor.vv v10, v12, v10
+; RV32V-NEXT:    vmul.vx v12, v8, s3
+; RV32V-NEXT:    lui s3, 2048
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s1
+; RV32V-NEXT:    lui s1, 4096
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t6
+; RV32V-NEXT:    lui t6, 8192
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t4
+; RV32V-NEXT:    lui t4, 16384
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t2
+; RV32V-NEXT:    lui t2, 32768
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t0
+; RV32V-NEXT:    lui t0, 65536
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a6
+; RV32V-NEXT:    lui a6, 131072
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a4
+; RV32V-NEXT:    lui a4, 262144
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 524288
+; RV32V-NEXT:    slli a1, a1, 11
+; RV32V-NEXT:    and a3, a0, a3
+; RV32V-NEXT:    and a5, a0, a5
+; RV32V-NEXT:    and a7, a0, a7
+; RV32V-NEXT:    and t1, a0, t1
+; RV32V-NEXT:    and t3, a0, t3
+; RV32V-NEXT:    and t5, a0, t5
+; RV32V-NEXT:    and s0, a0, s0
+; RV32V-NEXT:    and s2, a0, s2
+; RV32V-NEXT:    and s4, a0, s4
+; RV32V-NEXT:    and s6, a0, s6
+; RV32V-NEXT:    and s5, a0, s5
+; RV32V-NEXT:    and s3, a0, s3
+; RV32V-NEXT:    and s1, a0, s1
+; RV32V-NEXT:    and t6, a0, t6
+; RV32V-NEXT:    and t4, a0, t4
+; RV32V-NEXT:    and t2, a0, t2
+; RV32V-NEXT:    and t0, a0, t0
+; RV32V-NEXT:    and a6, a0, a6
+; RV32V-NEXT:    and a4, a0, a4
+; RV32V-NEXT:    and a2, a0, a2
+; RV32V-NEXT:    and a0, a0, a1
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a0
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a3
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a5
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a7
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t1
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t3
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t5
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s0
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s2
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s4
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s6
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s5
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s3
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s1
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t6
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t4
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t2
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t0
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a6
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a4
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v8, v8, a2
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    addi sp, sp, 32
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv4i32_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV64V-NEXT:    vmv.v.x v10, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v12, v10, 2
+; RV64V-NEXT:    vand.vi v14, v10, 1
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v14, v12
+; RV64V-NEXT:    vand.vi v14, v10, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vi v14, v10, 8
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v10, v10, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vv v8, v8, v10
+; RV64V-NEXT:    vxor.vv v8, v12, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmul_nxv4i32_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v16
+; RV32ZVBC64-NEXT:    vclmul.vv v12, v12, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv4i32_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v16
+; RV64ZVBC64-NEXT:    vclmul.vv v12, v12, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv4i32_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv4i32_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 8 x i32> @clmul_nxv8i32_vv(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv8i32_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vand.vi v16, v12, 2
+; RV32V-NEXT:    vand.vi v20, v12, 1
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v20, v16
+; RV32V-NEXT:    vand.vi v20, v12, 4
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vi v20, v12, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 16
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 32
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 64
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 128
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 256
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 512
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 1024
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 2048
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 4096
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 8192
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 16384
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 32768
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 65536
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 131072
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vand.vx v12, v12, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vmul.vv v8, v8, v12
+; RV32V-NEXT:    vxor.vv v8, v16, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv8i32_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV64V-NEXT:    vand.vi v16, v12, 2
+; RV64V-NEXT:    vand.vi v20, v12, 1
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v20, v16
+; RV64V-NEXT:    vand.vi v20, v12, 4
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vi v20, v12, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v12, v12, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vv v8, v8, v12
+; RV64V-NEXT:    vxor.vv v8, v16, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmul_nxv8i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v12
+; RV32ZVBC64-NEXT:    vzext.vf2 v24, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v16, v24, v16
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv8i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v12
+; RV64ZVBC64-NEXT:    vzext.vf2 v24, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v16, v24, v16
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv8i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv8i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v12
+; RV64ZVBC32-NEXT:    ret
+  %v = call <vscale x 8 x i32> @llvm.clmul.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @clmul_nxv8i32_vx(<vscale x 8 x i32> %va, i32 %b) nounwind {
+; RV32V-LABEL: clmul_nxv8i32_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -32
+; RV32V-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    andi s6, a0, 2
+; RV32V-NEXT:    andi s5, a0, 1
+; RV32V-NEXT:    andi s3, a0, 4
+; RV32V-NEXT:    andi s1, a0, 8
+; RV32V-NEXT:    andi t6, a0, 16
+; RV32V-NEXT:    andi t4, a0, 32
+; RV32V-NEXT:    andi t2, a0, 64
+; RV32V-NEXT:    andi t0, a0, 128
+; RV32V-NEXT:    andi a6, a0, 256
+; RV32V-NEXT:    andi a4, a0, 512
+; RV32V-NEXT:    andi a2, a0, 1024
+; RV32V-NEXT:    li a1, 1
+; RV32V-NEXT:    lui a3, 1
+; RV32V-NEXT:    lui a5, 2
+; RV32V-NEXT:    lui a7, 4
+; RV32V-NEXT:    lui t1, 8
+; RV32V-NEXT:    lui t3, 16
+; RV32V-NEXT:    lui t5, 32
+; RV32V-NEXT:    lui s0, 64
+; RV32V-NEXT:    lui s2, 128
+; RV32V-NEXT:    lui s4, 256
+; RV32V-NEXT:    vsetvli s7, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vmul.vx v12, v8, s6
+; RV32V-NEXT:    lui s6, 512
+; RV32V-NEXT:    vmul.vx v16, v8, s5
+; RV32V-NEXT:    lui s5, 1024
+; RV32V-NEXT:    vxor.vv v12, v16, v12
+; RV32V-NEXT:    vmul.vx v16, v8, s3
+; RV32V-NEXT:    lui s3, 2048
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s1
+; RV32V-NEXT:    lui s1, 4096
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t6
+; RV32V-NEXT:    lui t6, 8192
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t4
+; RV32V-NEXT:    lui t4, 16384
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t2
+; RV32V-NEXT:    lui t2, 32768
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t0
+; RV32V-NEXT:    lui t0, 65536
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a6
+; RV32V-NEXT:    lui a6, 131072
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a4
+; RV32V-NEXT:    lui a4, 262144
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a2
+; RV32V-NEXT:    lui a2, 524288
+; RV32V-NEXT:    slli a1, a1, 11
+; RV32V-NEXT:    and a3, a0, a3
+; RV32V-NEXT:    and a5, a0, a5
+; RV32V-NEXT:    and a7, a0, a7
+; RV32V-NEXT:    and t1, a0, t1
+; RV32V-NEXT:    and t3, a0, t3
+; RV32V-NEXT:    and t5, a0, t5
+; RV32V-NEXT:    and s0, a0, s0
+; RV32V-NEXT:    and s2, a0, s2
+; RV32V-NEXT:    and s4, a0, s4
+; RV32V-NEXT:    and s6, a0, s6
+; RV32V-NEXT:    and s5, a0, s5
+; RV32V-NEXT:    and s3, a0, s3
+; RV32V-NEXT:    and s1, a0, s1
+; RV32V-NEXT:    and t6, a0, t6
+; RV32V-NEXT:    and t4, a0, t4
+; RV32V-NEXT:    and t2, a0, t2
+; RV32V-NEXT:    and t0, a0, t0
+; RV32V-NEXT:    and a6, a0, a6
+; RV32V-NEXT:    and a4, a0, a4
+; RV32V-NEXT:    and a2, a0, a2
+; RV32V-NEXT:    and a0, a0, a1
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a0
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a3
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a5
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a7
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t1
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t3
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t5
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s0
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s2
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s4
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s6
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s5
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s3
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s1
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t6
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t4
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t2
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t0
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a6
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a4
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v8, v8, a2
+; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    addi sp, sp, 32
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv8i32_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV64V-NEXT:    vmv.v.x v12, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v16, v12, 2
+; RV64V-NEXT:    vand.vi v20, v12, 1
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v20, v16
+; RV64V-NEXT:    vand.vi v20, v12, 4
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vi v20, v12, 8
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v12, v12, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vv v8, v8, v12
+; RV64V-NEXT:    vxor.vv v8, v16, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmul_nxv8i32_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v24
+; RV32ZVBC64-NEXT:    vclmul.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv8i32_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v24
+; RV64ZVBC64-NEXT:    vclmul.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv8i32_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv8i32_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i32> @llvm.clmul.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 16 x i32> @clmul_nxv16i32_vv(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv16i32_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32V-NEXT:    vand.vi v24, v16, 2
+; RV32V-NEXT:    vand.vi v0, v16, 1
+; RV32V-NEXT:    vmul.vv v24, v8, v24
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v0, v24
+; RV32V-NEXT:    vand.vi v0, v16, 4
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vi v0, v16, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 16
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 32
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 64
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 128
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 256
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 512
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 1024
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 2048
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 4096
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 8192
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 16384
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 32768
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 65536
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 131072
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v16, a0
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vmul.vv v0, v8, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vv v8, v8, v16
+; RV32V-NEXT:    vxor.vv v8, v24, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv16i32_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV64V-NEXT:    vand.vi v24, v16, 2
+; RV64V-NEXT:    vand.vi v0, v16, 1
+; RV64V-NEXT:    vmul.vv v24, v8, v24
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v0, v24
+; RV64V-NEXT:    vand.vi v0, v16, 4
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vi v0, v16, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vv v8, v8, v16
+; RV64V-NEXT:    vxor.vv v8, v24, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmul_nxv16i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32ZVBC64-NEXT:    vand.vi v24, v16, 2
+; RV32ZVBC64-NEXT:    vand.vi v0, v16, 1
+; RV32ZVBC64-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV32ZVBC64-NEXT:    vand.vi v0, v16, 4
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vi v0, v16, 8
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 256
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 512
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 1024
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 1
+; RV32ZVBC64-NEXT:    slli a0, a0, 11
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 1
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 2
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 4
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 8
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 16
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 128
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 256
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 512
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 1024
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 2048
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 4096
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 8192
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 16384
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 32768
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 65536
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 131072
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 262144
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV32ZVBC64-NEXT:    lui a0, 524288
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    vxor.vv v8, v24, v8
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmul_nxv16i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV64ZVBC64-NEXT:    vand.vi v24, v16, 2
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 1
+; RV64ZVBC64-NEXT:    vmul.vv v24, v8, v24
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 8
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 256
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 512
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 1024
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 1
+; RV64ZVBC64-NEXT:    slli a0, a0, 11
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 1
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 2
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 8
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 16
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 128
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 256
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 512
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 1024
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 2048
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 4096
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 8192
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 16384
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 32768
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 65536
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 131072
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 262144
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 524288
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    vxor.vv v8, v24, v8
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmul_nxv16i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmul_nxv16i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vv v8, v8, v16
 ; RV64ZVBC32-NEXT:    ret
-  %v = call <vscale x 8 x i32> @llvm.clmul.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb)
-  ret <vscale x 8 x i32> %v
+  %v = call <vscale x 16 x i32> @llvm.clmul.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb)
+  ret <vscale x 16 x i32> %v
 }
 
-define <vscale x 8 x i32> @clmul_nxv8i32_vx(<vscale x 8 x i32> %va, i32 %b) nounwind {
-; RV32V-LABEL: clmul_nxv8i32_vx:
+define <vscale x 16 x i32> @clmul_nxv16i32_vx(<vscale x 16 x i32> %va, i32 %b) nounwind {
+; RV32V-LABEL: clmul_nxv16i32_vx:
 ; RV32V:       # %bb.0:
 ; RV32V-NEXT:    addi sp, sp, -32
 ; RV32V-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
@@ -8080,37 +7281,37 @@ define <vscale x 8 x i32> @clmul_nxv8i32_vx(<vscale x 8 x i32> %va, i32 %b) noun
 ; RV32V-NEXT:    lui s0, 64
 ; RV32V-NEXT:    lui s2, 128
 ; RV32V-NEXT:    lui s4, 256
-; RV32V-NEXT:    vsetvli s7, zero, e32, m4, ta, ma
-; RV32V-NEXT:    vmul.vx v12, v8, s6
+; RV32V-NEXT:    vsetvli s7, zero, e32, m8, ta, ma
+; RV32V-NEXT:    vmul.vx v16, v8, s6
 ; RV32V-NEXT:    lui s6, 512
-; RV32V-NEXT:    vmul.vx v16, v8, s5
+; RV32V-NEXT:    vmul.vx v24, v8, s5
 ; RV32V-NEXT:    lui s5, 1024
-; RV32V-NEXT:    vxor.vv v12, v16, v12
-; RV32V-NEXT:    vmul.vx v16, v8, s3
+; RV32V-NEXT:    vxor.vv v16, v24, v16
+; RV32V-NEXT:    vmul.vx v24, v8, s3
 ; RV32V-NEXT:    lui s3, 2048
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, s1
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, s1
 ; RV32V-NEXT:    lui s1, 4096
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, t6
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, t6
 ; RV32V-NEXT:    lui t6, 8192
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, t4
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, t4
 ; RV32V-NEXT:    lui t4, 16384
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, t2
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, t2
 ; RV32V-NEXT:    lui t2, 32768
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, t0
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, t0
 ; RV32V-NEXT:    lui t0, 65536
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, a6
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, a6
 ; RV32V-NEXT:    lui a6, 131072
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, a4
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, a4
 ; RV32V-NEXT:    lui a4, 262144
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, a2
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, a2
 ; RV32V-NEXT:    lui a2, 524288
 ; RV32V-NEXT:    slli a1, a1, 11
 ; RV32V-NEXT:    and a3, a0, a3
@@ -8134,49 +7335,49 @@ define <vscale x 8 x i32> @clmul_nxv8i32_vx(<vscale x 8 x i32> %va, i32 %b) noun
 ; RV32V-NEXT:    and a4, a0, a4
 ; RV32V-NEXT:    and a2, a0, a2
 ; RV32V-NEXT:    and a0, a0, a1
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, a0
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, a3
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, a5
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, a7
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, t1
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, t3
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, t5
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, s0
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, s2
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, s4
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, s6
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, s5
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, s3
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, s1
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, t6
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, t4
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, t2
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, t0
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, a6
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vmul.vx v16, v8, a4
-; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, a0
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, a3
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, a5
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, a7
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, t1
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, t3
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, t5
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, s0
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, s2
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, s4
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, s6
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, s5
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, s3
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, s1
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, t6
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, t4
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, t2
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, t0
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, a6
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, a4
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    vmul.vx v8, v8, a2
-; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    vxor.vv v8, v16, v8
 ; RV32V-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
 ; RV32V-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
 ; RV32V-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
@@ -8188,838 +7389,417 @@ define <vscale x 8 x i32> @clmul_nxv8i32_vx(<vscale x 8 x i32> %va, i32 %b) noun
 ; RV32V-NEXT:    addi sp, sp, 32
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmul_nxv8i32_vx:
+; RV64V-LABEL: clmul_nxv16i32_vx:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV64V-NEXT:    vmv.v.x v12, a0
+; RV64V-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV64V-NEXT:    vmv.v.x v16, a0
 ; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vand.vi v16, v12, 2
-; RV64V-NEXT:    vand.vi v20, v12, 1
-; RV64V-NEXT:    vmul.vv v16, v8, v16
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v20, v16
-; RV64V-NEXT:    vand.vi v20, v12, 4
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vi v20, v12, 8
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vand.vi v24, v16, 2
+; RV64V-NEXT:    vand.vi v0, v16, 1
+; RV64V-NEXT:    vmul.vv v24, v8, v24
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v0, v24
+; RV64V-NEXT:    vand.vi v0, v16, 4
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vi v0, v16, 8
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    li a0, 1
 ; RV64V-NEXT:    slli a0, a0, 11
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 1
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 2
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 4
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 8
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 16
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 32
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 64
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 128
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 256
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 512
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 1024
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 2048
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 4096
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 8192
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 16384
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 32768
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 65536
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 131072
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 262144
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v16, a0
 ; RV64V-NEXT:    lui a0, 524288
-; RV64V-NEXT:    vand.vx v12, v12, a0
-; RV64V-NEXT:    vmul.vv v20, v8, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vmul.vv v8, v8, v12
-; RV64V-NEXT:    vxor.vv v8, v16, v8
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vv v8, v8, v16
+; RV64V-NEXT:    vxor.vv v8, v24, v8
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmul_nxv8i32_vx:
+; RV32ZVBC64-LABEL: clmul_nxv16i32_vx:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
-; RV32ZVBC64-NEXT:    vzext.vf2 v8, v24
-; RV32ZVBC64-NEXT:    vclmul.vv v16, v16, v8
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV32ZVBC64-NEXT:    addi sp, sp, -32
+; RV32ZVBC64-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    andi s6, a0, 2
+; RV32ZVBC64-NEXT:    andi s5, a0, 1
+; RV32ZVBC64-NEXT:    andi s3, a0, 4
+; RV32ZVBC64-NEXT:    andi s1, a0, 8
+; RV32ZVBC64-NEXT:    andi t6, a0, 16
+; RV32ZVBC64-NEXT:    andi t4, a0, 32
+; RV32ZVBC64-NEXT:    andi t2, a0, 64
+; RV32ZVBC64-NEXT:    andi t0, a0, 128
+; RV32ZVBC64-NEXT:    andi a6, a0, 256
+; RV32ZVBC64-NEXT:    andi a4, a0, 512
+; RV32ZVBC64-NEXT:    andi a2, a0, 1024
+; RV32ZVBC64-NEXT:    li a1, 1
+; RV32ZVBC64-NEXT:    lui a3, 1
+; RV32ZVBC64-NEXT:    lui a5, 2
+; RV32ZVBC64-NEXT:    lui a7, 4
+; RV32ZVBC64-NEXT:    lui t1, 8
+; RV32ZVBC64-NEXT:    lui t3, 16
+; RV32ZVBC64-NEXT:    lui t5, 32
+; RV32ZVBC64-NEXT:    lui s0, 64
+; RV32ZVBC64-NEXT:    lui s2, 128
+; RV32ZVBC64-NEXT:    lui s4, 256
+; RV32ZVBC64-NEXT:    vsetvli s7, zero, e32, m8, ta, ma
+; RV32ZVBC64-NEXT:    vmul.vx v16, v8, s6
+; RV32ZVBC64-NEXT:    lui s6, 512
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, s5
+; RV32ZVBC64-NEXT:    lui s5, 1024
+; RV32ZVBC64-NEXT:    vxor.vv v16, v24, v16
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, s3
+; RV32ZVBC64-NEXT:    lui s3, 2048
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, s1
+; RV32ZVBC64-NEXT:    lui s1, 4096
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, t6
+; RV32ZVBC64-NEXT:    lui t6, 8192
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, t4
+; RV32ZVBC64-NEXT:    lui t4, 16384
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, t2
+; RV32ZVBC64-NEXT:    lui t2, 32768
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, t0
+; RV32ZVBC64-NEXT:    lui t0, 65536
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, a6
+; RV32ZVBC64-NEXT:    lui a6, 131072
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, a4
+; RV32ZVBC64-NEXT:    lui a4, 262144
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, a2
+; RV32ZVBC64-NEXT:    lui a2, 524288
+; RV32ZVBC64-NEXT:    slli a1, a1, 11
+; RV32ZVBC64-NEXT:    and a3, a0, a3
+; RV32ZVBC64-NEXT:    and a5, a0, a5
+; RV32ZVBC64-NEXT:    and a7, a0, a7
+; RV32ZVBC64-NEXT:    and t1, a0, t1
+; RV32ZVBC64-NEXT:    and t3, a0, t3
+; RV32ZVBC64-NEXT:    and t5, a0, t5
+; RV32ZVBC64-NEXT:    and s0, a0, s0
+; RV32ZVBC64-NEXT:    and s2, a0, s2
+; RV32ZVBC64-NEXT:    and s4, a0, s4
+; RV32ZVBC64-NEXT:    and s6, a0, s6
+; RV32ZVBC64-NEXT:    and s5, a0, s5
+; RV32ZVBC64-NEXT:    and s3, a0, s3
+; RV32ZVBC64-NEXT:    and s1, a0, s1
+; RV32ZVBC64-NEXT:    and t6, a0, t6
+; RV32ZVBC64-NEXT:    and t4, a0, t4
+; RV32ZVBC64-NEXT:    and t2, a0, t2
+; RV32ZVBC64-NEXT:    and t0, a0, t0
+; RV32ZVBC64-NEXT:    and a6, a0, a6
+; RV32ZVBC64-NEXT:    and a4, a0, a4
+; RV32ZVBC64-NEXT:    and a2, a0, a2
+; RV32ZVBC64-NEXT:    and a0, a0, a1
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, a0
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, a3
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, a5
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, a7
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, t1
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, t3
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, t5
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, s0
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, s2
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, s4
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, s6
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, s5
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, s3
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, s1
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, t6
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, t4
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, t2
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, t0
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, a6
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, a4
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v8, v8, a2
+; RV32ZVBC64-NEXT:    vxor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    addi sp, sp, 32
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmul_nxv8i32_vx:
+; RV64ZVBC64-LABEL: clmul_nxv16i32_vx:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
-; RV64ZVBC64-NEXT:    vzext.vf2 v8, v24
-; RV64ZVBC64-NEXT:    vclmul.vv v16, v16, v8
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 0
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vand.vi v24, v16, 2
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 1
+; RV64ZVBC64-NEXT:    vmul.vv v24, v8, v24
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vi v0, v16, 8
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 256
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 512
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 1024
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 1
+; RV64ZVBC64-NEXT:    slli a0, a0, 11
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 1
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 2
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 8
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 16
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 128
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 256
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 512
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 1024
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 2048
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 4096
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 8192
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 16384
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 32768
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 65536
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 131072
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 262144
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v16, a0
+; RV64ZVBC64-NEXT:    lui a0, 524288
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    vxor.vv v8, v24, v8
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmul_nxv8i32_vx:
+; RV32ZVBC32-LABEL: clmul_nxv16i32_vx:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    addi sp, sp, -32
-; RV32ZVBC32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
-; RV32ZVBC32-NEXT:    andi s6, a0, 2
-; RV32ZVBC32-NEXT:    andi s5, a0, 1
-; RV32ZVBC32-NEXT:    andi s3, a0, 4
-; RV32ZVBC32-NEXT:    andi s1, a0, 8
-; RV32ZVBC32-NEXT:    andi t6, a0, 16
-; RV32ZVBC32-NEXT:    andi t4, a0, 32
-; RV32ZVBC32-NEXT:    andi t2, a0, 64
-; RV32ZVBC32-NEXT:    andi t0, a0, 128
-; RV32ZVBC32-NEXT:    andi a6, a0, 256
-; RV32ZVBC32-NEXT:    andi a4, a0, 512
-; RV32ZVBC32-NEXT:    andi a2, a0, 1024
-; RV32ZVBC32-NEXT:    li a1, 1
-; RV32ZVBC32-NEXT:    lui a3, 1
-; RV32ZVBC32-NEXT:    lui a5, 2
-; RV32ZVBC32-NEXT:    lui a7, 4
-; RV32ZVBC32-NEXT:    lui t1, 8
-; RV32ZVBC32-NEXT:    lui t3, 16
-; RV32ZVBC32-NEXT:    lui t5, 32
-; RV32ZVBC32-NEXT:    lui s0, 64
-; RV32ZVBC32-NEXT:    lui s2, 128
-; RV32ZVBC32-NEXT:    lui s4, 256
-; RV32ZVBC32-NEXT:    vsetvli s7, zero, e32, m4, ta, ma
-; RV32ZVBC32-NEXT:    vmul.vx v12, v8, s6
-; RV32ZVBC32-NEXT:    lui s6, 512
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s5
-; RV32ZVBC32-NEXT:    lui s5, 1024
-; RV32ZVBC32-NEXT:    vxor.vv v12, v16, v12
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s3
-; RV32ZVBC32-NEXT:    lui s3, 2048
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s1
-; RV32ZVBC32-NEXT:    lui s1, 4096
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t6
-; RV32ZVBC32-NEXT:    lui t6, 8192
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t4
-; RV32ZVBC32-NEXT:    lui t4, 16384
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t2
-; RV32ZVBC32-NEXT:    lui t2, 32768
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t0
-; RV32ZVBC32-NEXT:    lui t0, 65536
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a6
-; RV32ZVBC32-NEXT:    lui a6, 131072
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a4
-; RV32ZVBC32-NEXT:    lui a4, 262144
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 524288
-; RV32ZVBC32-NEXT:    slli a1, a1, 11
-; RV32ZVBC32-NEXT:    and a3, a0, a3
-; RV32ZVBC32-NEXT:    and a5, a0, a5
-; RV32ZVBC32-NEXT:    and a7, a0, a7
-; RV32ZVBC32-NEXT:    and t1, a0, t1
-; RV32ZVBC32-NEXT:    and t3, a0, t3
-; RV32ZVBC32-NEXT:    and t5, a0, t5
-; RV32ZVBC32-NEXT:    and s0, a0, s0
-; RV32ZVBC32-NEXT:    and s2, a0, s2
-; RV32ZVBC32-NEXT:    and s4, a0, s4
-; RV32ZVBC32-NEXT:    and s6, a0, s6
-; RV32ZVBC32-NEXT:    and s5, a0, s5
-; RV32ZVBC32-NEXT:    and s3, a0, s3
-; RV32ZVBC32-NEXT:    and s1, a0, s1
-; RV32ZVBC32-NEXT:    and t6, a0, t6
-; RV32ZVBC32-NEXT:    and t4, a0, t4
-; RV32ZVBC32-NEXT:    and t2, a0, t2
-; RV32ZVBC32-NEXT:    and t0, a0, t0
-; RV32ZVBC32-NEXT:    and a6, a0, a6
-; RV32ZVBC32-NEXT:    and a4, a0, a4
-; RV32ZVBC32-NEXT:    and a2, a0, a2
-; RV32ZVBC32-NEXT:    and a0, a0, a1
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a0
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a3
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a5
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a7
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t1
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t3
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t5
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s0
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s2
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s4
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s6
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s5
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s3
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, s1
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t6
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t4
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t2
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, t0
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a6
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v16, v8, a4
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vx v8, v8, a2
-; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
-; RV32ZVBC32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
-; RV32ZVBC32-NEXT:    addi sp, sp, 32
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmul_nxv8i32_vx:
+; RV64ZVBC32-LABEL: clmul_nxv16i32_vx:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v12, a0
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v16, v12, 2
-; RV64ZVBC32-NEXT:    vand.vi v20, v12, 1
-; RV64ZVBC32-NEXT:    vmul.vv v16, v8, v16
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v20, v16
-; RV64ZVBC32-NEXT:    vand.vi v20, v12, 4
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vi v20, v12, 8
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 16
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 262144
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 524288
-; RV64ZVBC32-NEXT:    vand.vx v12, v12, a0
-; RV64ZVBC32-NEXT:    vmul.vv v20, v8, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vmul.vv v8, v8, v12
-; RV64ZVBC32-NEXT:    vxor.vv v8, v16, v8
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV64ZVBC32-NEXT:    vclmul.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
-  %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
-  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-  %v = call <vscale x 8 x i32> @llvm.clmul.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb)
-  ret <vscale x 8 x i32> %v
-}
-
-define <vscale x 16 x i32> @clmul_nxv16i32_vv(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv16i32_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vand.vi v24, v16, 2
-; CHECK-NEXT:    vand.vi v0, v16, 1
-; CHECK-NEXT:    vmul.vv v24, v8, v24
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v0, v24
-; CHECK-NEXT:    vand.vi v0, v16, 4
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vi v0, v16, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 32
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 64
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 128
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 256
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 512
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 1024
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 2048
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 4096
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 8192
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 16384
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 32768
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 65536
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 131072
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 262144
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 524288
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    vxor.vv v8, v24, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 16 x i32> @llvm.clmul.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb)
-  ret <vscale x 16 x i32> %v
-}
-
-define <vscale x 16 x i32> @clmul_nxv16i32_vx(<vscale x 16 x i32> %va, i32 %b) nounwind {
-; RV32-LABEL: clmul_nxv16i32_vx:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
-; RV32-NEXT:    andi s6, a0, 2
-; RV32-NEXT:    andi s5, a0, 1
-; RV32-NEXT:    andi s3, a0, 4
-; RV32-NEXT:    andi s1, a0, 8
-; RV32-NEXT:    andi t6, a0, 16
-; RV32-NEXT:    andi t4, a0, 32
-; RV32-NEXT:    andi t2, a0, 64
-; RV32-NEXT:    andi t0, a0, 128
-; RV32-NEXT:    andi a6, a0, 256
-; RV32-NEXT:    andi a4, a0, 512
-; RV32-NEXT:    andi a2, a0, 1024
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    lui a3, 1
-; RV32-NEXT:    lui a5, 2
-; RV32-NEXT:    lui a7, 4
-; RV32-NEXT:    lui t1, 8
-; RV32-NEXT:    lui t3, 16
-; RV32-NEXT:    lui t5, 32
-; RV32-NEXT:    lui s0, 64
-; RV32-NEXT:    lui s2, 128
-; RV32-NEXT:    lui s4, 256
-; RV32-NEXT:    vsetvli s7, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmul.vx v16, v8, s6
-; RV32-NEXT:    lui s6, 512
-; RV32-NEXT:    vmul.vx v24, v8, s5
-; RV32-NEXT:    lui s5, 1024
-; RV32-NEXT:    vxor.vv v16, v24, v16
-; RV32-NEXT:    vmul.vx v24, v8, s3
-; RV32-NEXT:    lui s3, 2048
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, s1
-; RV32-NEXT:    lui s1, 4096
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, t6
-; RV32-NEXT:    lui t6, 8192
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, t4
-; RV32-NEXT:    lui t4, 16384
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, t2
-; RV32-NEXT:    lui t2, 32768
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, t0
-; RV32-NEXT:    lui t0, 65536
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, a6
-; RV32-NEXT:    lui a6, 131072
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, a4
-; RV32-NEXT:    lui a4, 262144
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, a2
-; RV32-NEXT:    lui a2, 524288
-; RV32-NEXT:    slli a1, a1, 11
-; RV32-NEXT:    and a3, a0, a3
-; RV32-NEXT:    and a5, a0, a5
-; RV32-NEXT:    and a7, a0, a7
-; RV32-NEXT:    and t1, a0, t1
-; RV32-NEXT:    and t3, a0, t3
-; RV32-NEXT:    and t5, a0, t5
-; RV32-NEXT:    and s0, a0, s0
-; RV32-NEXT:    and s2, a0, s2
-; RV32-NEXT:    and s4, a0, s4
-; RV32-NEXT:    and s6, a0, s6
-; RV32-NEXT:    and s5, a0, s5
-; RV32-NEXT:    and s3, a0, s3
-; RV32-NEXT:    and s1, a0, s1
-; RV32-NEXT:    and t6, a0, t6
-; RV32-NEXT:    and t4, a0, t4
-; RV32-NEXT:    and t2, a0, t2
-; RV32-NEXT:    and t0, a0, t0
-; RV32-NEXT:    and a6, a0, a6
-; RV32-NEXT:    and a4, a0, a4
-; RV32-NEXT:    and a2, a0, a2
-; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, a0
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, a3
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, a5
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, a7
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, t1
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, t3
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, t5
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, s0
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, s2
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, s4
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, s6
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, s5
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, s3
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, s1
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, t6
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, t4
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, t2
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, t0
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, a6
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, a4
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v8, v8, a2
-; RV32-NEXT:    vxor.vv v8, v16, v8
-; RV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 32
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: clmul_nxv16i32_vx:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV64-NEXT:    vmv.v.x v16, a0
-; RV64-NEXT:    li a0, 16
-; RV64-NEXT:    vand.vi v24, v16, 2
-; RV64-NEXT:    vand.vi v0, v16, 1
-; RV64-NEXT:    vmul.vv v24, v8, v24
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v0, v24
-; RV64-NEXT:    vand.vi v0, v16, 4
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vi v0, v16, 8
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    li a0, 64
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    li a0, 128
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    li a0, 256
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    li a0, 512
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    li a0, 1024
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    slli a0, a0, 11
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 1
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 2
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 4
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 8
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 16
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 32
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 64
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 128
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 256
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 512
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 1024
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 2048
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 4096
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 8192
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 16384
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 32768
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 65536
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 131072
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 262144
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vand.vx v0, v16, a0
-; RV64-NEXT:    lui a0, 524288
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vmul.vv v0, v8, v0
-; RV64-NEXT:    vxor.vv v24, v24, v0
-; RV64-NEXT:    vmul.vv v8, v8, v16
-; RV64-NEXT:    vxor.vv v8, v24, v8
-; RV64-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
   %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
   %v = call <vscale x 16 x i32> @llvm.clmul.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb)
@@ -31532,5 +30312,8 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vx_mask(<vscale x 1 x i64> %va, i64 %b,
   ret <vscale x 1 x i64> %w
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
+; RV32: {{.*}}
 ; RV32ZVBC: {{.*}}
+; RV64: {{.*}}
 ; RV64ZVBC: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll
index e7492a670e3af..8b79db4fca4de 100644
--- a/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll
@@ -109,74 +109,14 @@ define <vscale x 1 x i8> @clmulh_nxv1i8_vv(<vscale x 1 x i8> %va, <vscale x 1 x
 ;
 ; RV32ZVBC32-LABEL: clmulh_nxv1i8_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmulh_nxv1i8_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 1 x i8> %va to <vscale x 1 x i16>
   %vb.ext = zext <vscale x 1 x i8> %vb to <vscale x 1 x i16>
@@ -298,77 +238,13 @@ define <vscale x 1 x i8> @clmulh_nxv1i8_vx(<vscale x 1 x i8> %va, i8 %b) nounwin
 ; RV32ZVBC32-LABEL: clmulh_nxv1i8_vx:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmulh_nxv1i8_vx:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
@@ -483,74 +359,14 @@ define <vscale x 2 x i8> @clmulh_nxv2i8_vv(<vscale x 2 x i8> %va, <vscale x 2 x
 ;
 ; RV32ZVBC32-LABEL: clmulh_nxv2i8_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmulh_nxv2i8_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 2 x i8> %va to <vscale x 2 x i16>
   %vb.ext = zext <vscale x 2 x i8> %vb to <vscale x 2 x i16>
@@ -672,77 +488,13 @@ define <vscale x 2 x i8> @clmulh_nxv2i8_vx(<vscale x 2 x i8> %va, i8 %b) nounwin
 ; RV32ZVBC32-LABEL: clmulh_nxv2i8_vx:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmulh_nxv2i8_vx:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
@@ -857,74 +609,14 @@ define <vscale x 4 x i8> @clmulh_nxv4i8_vv(<vscale x 4 x i8> %va, <vscale x 4 x
 ;
 ; RV32ZVBC32-LABEL: clmulh_nxv4i8_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmulh_nxv4i8_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 4 x i8> %va to <vscale x 4 x i16>
   %vb.ext = zext <vscale x 4 x i8> %vb to <vscale x 4 x i16>
@@ -1046,77 +738,13 @@ define <vscale x 4 x i8> @clmulh_nxv4i8_vx(<vscale x 4 x i8> %va, i8 %b) nounwin
 ; RV32ZVBC32-LABEL: clmulh_nxv4i8_vx:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmulh_nxv4i8_vx:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
@@ -1231,74 +859,14 @@ define <vscale x 8 x i8> @clmulh_nxv8i8_vv(<vscale x 8 x i8> %va, <vscale x 8 x
 ;
 ; RV32ZVBC32-LABEL: clmulh_nxv8i8_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC32-NEXT:    vzext.vf2 v12, v9
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v8, v12, 2
-; RV32ZVBC32-NEXT:    vand.vi v14, v12, 1
-; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v14, v8
-; RV32ZVBC32-NEXT:    vand.vi v14, v12, 4
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vi v14, v12, 8
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v12, v12, a0
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vmul.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v10, 8
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmulh_nxv8i8_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC32-NEXT:    vzext.vf2 v12, v9
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v8, v12, 2
-; RV64ZVBC32-NEXT:    vand.vi v14, v12, 1
-; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v14, v8
-; RV64ZVBC32-NEXT:    vand.vi v14, v12, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vi v14, v12, 8
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v12, v12, a0
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vmul.vv v10, v10, v12
-; RV64ZVBC32-NEXT:    vxor.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v10, 8
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 8 x i8> %va to <vscale x 8 x i16>
   %vb.ext = zext <vscale x 8 x i8> %vb to <vscale x 8 x i16>
@@ -1420,77 +988,13 @@ define <vscale x 8 x i8> @clmulh_nxv8i8_vx(<vscale x 8 x i8> %va, i8 %b) nounwin
 ; RV32ZVBC32-LABEL: clmulh_nxv8i8_vx:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v12, a0
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v12
-; RV32ZVBC32-NEXT:    vand.vi v12, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v14, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v14, v12
-; RV32ZVBC32-NEXT:    vand.vi v14, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vi v14, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vxor.vv v10, v12, v8
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v10, 8
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmulh_nxv8i8_vx:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v12, a0
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v12
-; RV64ZVBC32-NEXT:    vand.vi v12, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v14, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
-; RV64ZVBC32-NEXT:    vand.vi v14, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vi v14, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vxor.vv v10, v12, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v10, 8
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -1503,41 +1007,161 @@ define <vscale x 8 x i8> @clmulh_nxv8i8_vx(<vscale x 8 x i8> %va, i8 %b) nounwin
 }
 
 define <vscale x 16 x i8> @clmulh_nxv16i8_vv(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb) nounwind {
-; CHECK-LABEL: clmulh_nxv16i8_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
-; CHECK-NEXT:    vzext.vf2 v16, v10
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v8, v16, 2
-; CHECK-NEXT:    vand.vi v20, v16, 1
-; CHECK-NEXT:    vmul.vv v8, v12, v8
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v20, v8
-; CHECK-NEXT:    vand.vi v20, v16, 4
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vi v20, v16, 8
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vx v20, v16, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vx v20, v16, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vx v20, v16, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vmul.vv v12, v12, v16
-; CHECK-NEXT:    vxor.vv v12, v8, v12
-; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v12, 8
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmulh_nxv16i8_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; RV32V-NEXT:    vzext.vf2 v12, v8
+; RV32V-NEXT:    vzext.vf2 v16, v10
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v8, v16, 2
+; RV32V-NEXT:    vand.vi v20, v16, 1
+; RV32V-NEXT:    vmul.vv v8, v12, v8
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v20, v8
+; RV32V-NEXT:    vand.vi v20, v16, 4
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vi v20, v16, 8
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vmul.vv v12, v12, v16
+; RV32V-NEXT:    vxor.vv v12, v8, v12
+; RV32V-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v12, 8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv16i8_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; RV64V-NEXT:    vzext.vf2 v12, v8
+; RV64V-NEXT:    vzext.vf2 v16, v10
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v8, v16, 2
+; RV64V-NEXT:    vand.vi v20, v16, 1
+; RV64V-NEXT:    vmul.vv v8, v12, v8
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v20, v8
+; RV64V-NEXT:    vand.vi v20, v16, 4
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vi v20, v16, 8
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vmul.vv v12, v12, v16
+; RV64V-NEXT:    vxor.vv v12, v8, v12
+; RV64V-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v12, 8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmulh_nxv16i8_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v10
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vand.vi v8, v16, 2
+; RV32ZVBC64-NEXT:    vand.vi v20, v16, 1
+; RV32ZVBC64-NEXT:    vmul.vv v8, v12, v8
+; RV32ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC64-NEXT:    vxor.vv v8, v20, v8
+; RV32ZVBC64-NEXT:    vand.vi v20, v16, 4
+; RV32ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC64-NEXT:    vand.vi v20, v16, 8
+; RV32ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v16, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v20
+; RV32ZVBC64-NEXT:    vmul.vv v12, v12, v16
+; RV32ZVBC64-NEXT:    vxor.vv v12, v8, v12
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 8
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv16i8_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v10
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vand.vi v8, v16, 2
+; RV64ZVBC64-NEXT:    vand.vi v20, v16, 1
+; RV64ZVBC64-NEXT:    vmul.vv v8, v12, v8
+; RV64ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC64-NEXT:    vxor.vv v8, v20, v8
+; RV64ZVBC64-NEXT:    vand.vi v20, v16, 4
+; RV64ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC64-NEXT:    vand.vi v20, v16, 8
+; RV64ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v16, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v20
+; RV64ZVBC64-NEXT:    vmul.vv v12, v12, v16
+; RV64ZVBC64-NEXT:    vxor.vv v12, v8, v12
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 8
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv16i8_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v10
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv16i8_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v10
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 16 x i8> %va to <vscale x 16 x i16>
   %vb.ext = zext <vscale x 16 x i8> %vb to <vscale x 16 x i16>
   %clmul = call <vscale x 16 x i16> @llvm.clmul.nxv16i16(<vscale x 16 x i16> %va.ext, <vscale x 16 x i16> %vb.ext)
@@ -1547,43 +1171,169 @@ define <vscale x 16 x i8> @clmulh_nxv16i8_vv(<vscale x 16 x i8> %va, <vscale x 1
 }
 
 define <vscale x 16 x i8> @clmulh_nxv16i8_vx(<vscale x 16 x i8> %va, i8 %b) nounwind {
-; CHECK-LABEL: clmulh_nxv16i8_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vzext.vf2 v8, v16
-; CHECK-NEXT:    vand.vi v16, v8, 2
-; CHECK-NEXT:    vand.vi v20, v8, 1
-; CHECK-NEXT:    vmul.vv v16, v12, v16
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v20, v16
-; CHECK-NEXT:    vand.vi v20, v8, 4
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vi v20, v8, 8
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vmul.vv v8, v12, v8
-; CHECK-NEXT:    vxor.vv v12, v16, v8
-; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v12, 8
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmulh_nxv16i8_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV32V-NEXT:    vmv.v.x v16, a0
+; RV32V-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; RV32V-NEXT:    vzext.vf2 v12, v8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vzext.vf2 v8, v16
+; RV32V-NEXT:    vand.vi v16, v8, 2
+; RV32V-NEXT:    vand.vi v20, v8, 1
+; RV32V-NEXT:    vmul.vv v16, v12, v16
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v20, v16
+; RV32V-NEXT:    vand.vi v20, v8, 4
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vi v20, v8, 8
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vmul.vv v8, v12, v8
+; RV32V-NEXT:    vxor.vv v12, v16, v8
+; RV32V-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v12, 8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv16i8_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV64V-NEXT:    vmv.v.x v16, a0
+; RV64V-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; RV64V-NEXT:    vzext.vf2 v12, v8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vzext.vf2 v8, v16
+; RV64V-NEXT:    vand.vi v16, v8, 2
+; RV64V-NEXT:    vand.vi v20, v8, 1
+; RV64V-NEXT:    vmul.vv v16, v12, v16
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v20, v16
+; RV64V-NEXT:    vand.vi v20, v8, 4
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vi v20, v8, 8
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vv v8, v12, v8
+; RV64V-NEXT:    vxor.vv v12, v16, v8
+; RV64V-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v12, 8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmulh_nxv16i8_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v16
+; RV32ZVBC64-NEXT:    vand.vi v16, v8, 2
+; RV32ZVBC64-NEXT:    vand.vi v20, v8, 1
+; RV32ZVBC64-NEXT:    vmul.vv v16, v12, v16
+; RV32ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v20, v16
+; RV32ZVBC64-NEXT:    vand.vi v20, v8, 4
+; RV32ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vi v20, v8, 8
+; RV32ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vand.vx v20, v8, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV32ZVBC64-NEXT:    vmul.vv v8, v12, v8
+; RV32ZVBC64-NEXT:    vxor.vv v12, v16, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v12, 8
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv16i8_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v16
+; RV64ZVBC64-NEXT:    vand.vi v16, v8, 2
+; RV64ZVBC64-NEXT:    vand.vi v20, v8, 1
+; RV64ZVBC64-NEXT:    vmul.vv v16, v12, v16
+; RV64ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v20, v16
+; RV64ZVBC64-NEXT:    vand.vi v20, v8, 4
+; RV64ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vi v20, v8, 8
+; RV64ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v8, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v8, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vand.vx v20, v8, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    vmul.vv v20, v12, v20
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v20
+; RV64ZVBC64-NEXT:    vmul.vv v8, v12, v8
+; RV64ZVBC64-NEXT:    vxor.vv v12, v16, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v12, 8
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv16i8_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv16i8_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
   %va.ext = zext <vscale x 16 x i8> %va to <vscale x 16 x i16>
@@ -1595,41 +1345,161 @@ define <vscale x 16 x i8> @clmulh_nxv16i8_vx(<vscale x 16 x i8> %va, i8 %b) noun
 }
 
 define <vscale x 32 x i8> @clmulh_nxv32i8_vv(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb) nounwind {
-; CHECK-LABEL: clmulh_nxv32i8_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vzext.vf2 v16, v8
-; CHECK-NEXT:    vzext.vf2 v24, v12
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v8, v24, 2
-; CHECK-NEXT:    vand.vi v0, v24, 1
-; CHECK-NEXT:    vmul.vv v8, v16, v8
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v0, v8
-; CHECK-NEXT:    vand.vi v0, v24, 4
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vi v0, v24, 8
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vx v0, v24, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vx v0, v24, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vx v0, v24, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v24, v24, a0
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vmul.vv v16, v16, v24
-; CHECK-NEXT:    vxor.vv v16, v8, v16
-; CHECK-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v16, 8
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmulh_nxv32i8_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV32V-NEXT:    vzext.vf2 v16, v8
+; RV32V-NEXT:    vzext.vf2 v24, v12
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v8, v24, 2
+; RV32V-NEXT:    vand.vi v0, v24, 1
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v0, v8
+; RV32V-NEXT:    vand.vi v0, v24, 4
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vi v0, v24, 8
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v24, v24, a0
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vmul.vv v16, v16, v24
+; RV32V-NEXT:    vxor.vv v16, v8, v16
+; RV32V-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v16, 8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv32i8_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV64V-NEXT:    vzext.vf2 v16, v8
+; RV64V-NEXT:    vzext.vf2 v24, v12
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v8, v24, 2
+; RV64V-NEXT:    vand.vi v0, v24, 1
+; RV64V-NEXT:    vmul.vv v8, v16, v8
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v0, v8
+; RV64V-NEXT:    vand.vi v0, v24, 4
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vi v0, v24, 8
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v24, v24, a0
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vmul.vv v16, v16, v24
+; RV64V-NEXT:    vxor.vv v16, v8, v16
+; RV64V-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v16, 8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmulh_nxv32i8_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v24, v12
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vand.vi v8, v24, 2
+; RV32ZVBC64-NEXT:    vand.vi v0, v24, 1
+; RV32ZVBC64-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v0, v8
+; RV32ZVBC64-NEXT:    vand.vi v0, v24, 4
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vi v0, v24, 8
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vand.vx v24, v24, a0
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vmul.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vxor.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 8
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv32i8_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v24, v12
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vand.vi v8, v24, 2
+; RV64ZVBC64-NEXT:    vand.vi v0, v24, 1
+; RV64ZVBC64-NEXT:    vmul.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v0, v8
+; RV64ZVBC64-NEXT:    vand.vi v0, v24, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vi v0, v24, 8
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a0
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vmul.vv v16, v16, v24
+; RV64ZVBC64-NEXT:    vxor.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 8
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv32i8_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv32i8_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v12
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 32 x i8> %va to <vscale x 32 x i16>
   %vb.ext = zext <vscale x 32 x i8> %vb to <vscale x 32 x i16>
   %clmul = call <vscale x 32 x i16> @llvm.clmul.nxv32i16(<vscale x 32 x i16> %va.ext, <vscale x 32 x i16> %vb.ext)
@@ -1639,43 +1509,169 @@ define <vscale x 32 x i8> @clmulh_nxv32i8_vv(<vscale x 32 x i8> %va, <vscale x 3
 }
 
 define <vscale x 32 x i8> @clmulh_nxv32i8_vx(<vscale x 32 x i8> %va, i8 %b) nounwind {
-; CHECK-LABEL: clmulh_nxv32i8_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v24, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vzext.vf2 v16, v8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vzext.vf2 v8, v24
-; CHECK-NEXT:    vand.vi v24, v8, 2
-; CHECK-NEXT:    vand.vi v0, v8, 1
-; CHECK-NEXT:    vmul.vv v24, v16, v24
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v0, v24
-; CHECK-NEXT:    vand.vi v0, v8, 4
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vi v0, v8, 8
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vmul.vv v8, v16, v8
-; CHECK-NEXT:    vxor.vv v16, v24, v8
-; CHECK-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v16, 8
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmulh_nxv32i8_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV32V-NEXT:    vmv.v.x v24, a0
+; RV32V-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
+; RV32V-NEXT:    vzext.vf2 v16, v8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vzext.vf2 v8, v24
+; RV32V-NEXT:    vand.vi v24, v8, 2
+; RV32V-NEXT:    vand.vi v0, v8, 1
+; RV32V-NEXT:    vmul.vv v24, v16, v24
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v0, v24
+; RV32V-NEXT:    vand.vi v0, v8, 4
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vi v0, v8, 8
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vxor.vv v16, v24, v8
+; RV32V-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v16, 8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv32i8_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV64V-NEXT:    vmv.v.x v24, a0
+; RV64V-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
+; RV64V-NEXT:    vzext.vf2 v16, v8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vzext.vf2 v8, v24
+; RV64V-NEXT:    vand.vi v24, v8, 2
+; RV64V-NEXT:    vand.vi v0, v8, 1
+; RV64V-NEXT:    vmul.vv v24, v16, v24
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v0, v24
+; RV64V-NEXT:    vand.vi v0, v8, 4
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vi v0, v8, 8
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vv v8, v16, v8
+; RV64V-NEXT:    vxor.vv v16, v24, v8
+; RV64V-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v16, 8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmulh_nxv32i8_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v24
+; RV32ZVBC64-NEXT:    vand.vi v24, v8, 2
+; RV32ZVBC64-NEXT:    vand.vi v0, v8, 1
+; RV32ZVBC64-NEXT:    vmul.vv v24, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV32ZVBC64-NEXT:    vand.vi v0, v8, 4
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vi v0, v8, 8
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vxor.vv v16, v24, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 8
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv32i8_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v24
+; RV64ZVBC64-NEXT:    vand.vi v24, v8, 2
+; RV64ZVBC64-NEXT:    vand.vi v0, v8, 1
+; RV64ZVBC64-NEXT:    vmul.vv v24, v16, v24
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV64ZVBC64-NEXT:    vand.vi v0, v8, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vi v0, v8, 8
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vmul.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vxor.vv v16, v24, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 8
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv32i8_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv32i8_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
   %va.ext = zext <vscale x 32 x i8> %va to <vscale x 32 x i16>
@@ -1687,922 +1683,2275 @@ define <vscale x 32 x i8> @clmulh_nxv32i8_vx(<vscale x 32 x i8> %va, i8 %b) noun
 }
 
 define <vscale x 64 x i8> @clmulh_nxv64i8_vv(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb) nounwind {
-; CHECK-LABEL: clmulh_nxv64i8_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vsll.vi v24, v8, 4
-; CHECK-NEXT:    vsrl.vi v8, v8, 4
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    li a1, 85
-; CHECK-NEXT:    vsll.vi v0, v16, 4
-; CHECK-NEXT:    vsrl.vi v16, v16, 4
-; CHECK-NEXT:    li a2, 16
-; CHECK-NEXT:    li a3, 32
-; CHECK-NEXT:    vor.vv v16, v16, v0
-; CHECK-NEXT:    vsrl.vi v0, v16, 2
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vand.vx v0, v0, a0
-; CHECK-NEXT:    vsll.vi v16, v16, 2
-; CHECK-NEXT:    vor.vv v16, v0, v16
-; CHECK-NEXT:    vsrl.vi v0, v16, 1
-; CHECK-NEXT:    vand.vx v16, v16, a1
-; CHECK-NEXT:    vand.vx v0, v0, a1
-; CHECK-NEXT:    vadd.vv v16, v16, v16
-; CHECK-NEXT:    vor.vv v0, v0, v16
-; CHECK-NEXT:    vand.vx v16, v0, a2
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    mv a4, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    li a2, 64
-; CHECK-NEXT:    vor.vv v8, v8, v24
-; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsll.vi v8, v8, 2
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    vand.vx v8, v8, a1
-; CHECK-NEXT:    vand.vx v16, v16, a1
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vand.vx v16, v0, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    li a3, 128
-; CHECK-NEXT:    vand.vx v16, v0, a2
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vi v16, v0, 2
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a4, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vi v24, v0, 1
-; CHECK-NEXT:    vand.vi v16, v0, 4
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vi v16, v0, 8
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v0, v0, a3
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v0, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vmul.vv v24, v8, v24
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v0, v8, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 2
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 2
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v16, v24, v16
-; CHECK-NEXT:    vxor.vv v16, v16, v0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v16, v24
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 2
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vxor.vv v8, v24, v8
-; CHECK-NEXT:    vsll.vi v16, v16, 4
-; CHECK-NEXT:    vsrl.vi v8, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsll.vi v8, v8, 2
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    vand.vx v8, v8, a1
-; CHECK-NEXT:    vand.vx v16, v16, a1
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v8, v8, 1
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-  %va.ext = zext <vscale x 64 x i8> %va to <vscale x 64 x i16>
-  %vb.ext = zext <vscale x 64 x i8> %vb to <vscale x 64 x i16>
-  %clmul = call <vscale x 64 x i16> @llvm.clmul.nxv64i16(<vscale x 64 x i16> %va.ext, <vscale x 64 x i16> %vb.ext)
-  %res.ext = lshr <vscale x 64 x i16> %clmul, splat(i16 8)
-  %res = trunc <vscale x 64 x i16> %res.ext to <vscale x 64 x i8>
-  ret <vscale x 64 x i8> %res
-}
-
-define <vscale x 64 x i8> @clmulh_nxv64i8_vx(<vscale x 64 x i8> %va, i8 %b) nounwind {
-; CHECK-LABEL: clmulh_nxv64i8_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a2, a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v24, a0
-; CHECK-NEXT:    vsll.vi v16, v8, 4
-; CHECK-NEXT:    vsrl.vi v8, v8, 4
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    li a1, 85
-; CHECK-NEXT:    li a2, 16
-; CHECK-NEXT:    li a3, 32
-; CHECK-NEXT:    vsll.vi v0, v24, 4
-; CHECK-NEXT:    vsrl.vi v24, v24, 4
-; CHECK-NEXT:    vor.vv v24, v24, v0
-; CHECK-NEXT:    vsrl.vi v0, v24, 2
-; CHECK-NEXT:    vand.vx v24, v24, a0
-; CHECK-NEXT:    vand.vx v0, v0, a0
-; CHECK-NEXT:    vsll.vi v24, v24, 2
-; CHECK-NEXT:    vor.vv v24, v0, v24
-; CHECK-NEXT:    vsrl.vi v0, v24, 1
-; CHECK-NEXT:    vand.vx v24, v24, a1
-; CHECK-NEXT:    vand.vx v0, v0, a1
-; CHECK-NEXT:    vadd.vv v24, v24, v24
-; CHECK-NEXT:    vor.vv v0, v0, v24
-; CHECK-NEXT:    vand.vx v24, v0, a2
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    mv a4, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    li a2, 64
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsll.vi v8, v8, 2
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    vand.vx v8, v8, a1
-; CHECK-NEXT:    vand.vx v16, v16, a1
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vand.vx v16, v0, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    li a3, 128
-; CHECK-NEXT:    vand.vx v16, v0, a2
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vi v16, v0, 2
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a4, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vi v24, v0, 1
-; CHECK-NEXT:    vand.vi v16, v0, 4
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vi v16, v0, 8
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v0, v0, a3
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v0, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vmul.vv v24, v8, v24
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v0, v8, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 2
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 2
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v16, v24, v16
-; CHECK-NEXT:    vxor.vv v16, v16, v0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v16, v24
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 2
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vxor.vv v8, v24, v8
-; CHECK-NEXT:    vsll.vi v16, v16, 4
-; CHECK-NEXT:    vsrl.vi v8, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsll.vi v8, v8, 2
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    vand.vx v8, v8, a1
-; CHECK-NEXT:    vand.vx v16, v16, a1
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v8, v8, 1
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
-  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
-  %va.ext = zext <vscale x 64 x i8> %va to <vscale x 64 x i16>
-  %vb.ext = zext <vscale x 64 x i8> %vb to <vscale x 64 x i16>
-  %clmul = call <vscale x 64 x i16> @llvm.clmul.nxv64i16(<vscale x 64 x i16> %va.ext, <vscale x 64 x i16> %vb.ext)
-  %res.ext = lshr <vscale x 64 x i16> %clmul, splat(i16 8)
-  %res = trunc <vscale x 64 x i16> %res.ext to <vscale x 64 x i8>
-  ret <vscale x 64 x i8> %res
-}
-
-define <vscale x 1 x i16> @clmulh_nxv1i16_vv(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb) nounwind {
-; RV32V-LABEL: clmulh_nxv1i16_vv:
+; RV32V-LABEL: clmulh_nxv64i8_vv:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; RV32V-NEXT:    vzext.vf2 v10, v8
-; RV32V-NEXT:    vzext.vf2 v8, v9
-; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vand.vi v9, v8, 2
-; RV32V-NEXT:    vand.vi v11, v8, 1
-; RV32V-NEXT:    vmul.vv v9, v10, v9
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v11, v9
-; RV32V-NEXT:    vand.vi v11, v8, 4
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vi v11, v8, 8
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a0
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a0
-; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a0
-; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a0
-; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a0
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a0
-; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a0
-; RV32V-NEXT:    li a0, 1
-; RV32V-NEXT:    slli a0, a0, 11
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a0
-; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a0
-; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a0
-; RV32V-NEXT:    lui a0, 4
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a0
-; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    addi sp, sp, -16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    sub sp, sp, a0
+; RV32V-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV32V-NEXT:    vsll.vi v24, v8, 4
+; RV32V-NEXT:    vsrl.vi v8, v8, 4
+; RV32V-NEXT:    li a0, 51
+; RV32V-NEXT:    li a1, 85
+; RV32V-NEXT:    vsll.vi v0, v16, 4
+; RV32V-NEXT:    vsrl.vi v16, v16, 4
+; RV32V-NEXT:    li a2, 16
+; RV32V-NEXT:    li a3, 32
+; RV32V-NEXT:    vor.vv v16, v16, v0
+; RV32V-NEXT:    vsrl.vi v0, v16, 2
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vand.vx v0, v0, a0
+; RV32V-NEXT:    vsll.vi v16, v16, 2
+; RV32V-NEXT:    vor.vv v16, v0, v16
+; RV32V-NEXT:    vsrl.vi v0, v16, 1
+; RV32V-NEXT:    vand.vx v16, v16, a1
+; RV32V-NEXT:    vand.vx v0, v0, a1
+; RV32V-NEXT:    vadd.vv v16, v16, v16
+; RV32V-NEXT:    vor.vv v0, v0, v16
+; RV32V-NEXT:    vand.vx v16, v0, a2
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 4
+; RV32V-NEXT:    mv a4, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a4
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    li a2, 64
+; RV32V-NEXT:    vor.vv v8, v8, v24
+; RV32V-NEXT:    vsrl.vi v16, v8, 2
 ; RV32V-NEXT:    vand.vx v8, v8, a0
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    vxor.vv v8, v9, v8
-; RV32V-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32V-NEXT:    vnsrl.wi v8, v8, 16
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
+; RV32V-NEXT:    vand.vx v8, v8, a1
+; RV32V-NEXT:    vand.vx v16, v16, a1
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vand.vx v16, v0, a3
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    li a3, 128
+; RV32V-NEXT:    vand.vx v16, v0, a2
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 5
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v16, v0, 2
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    mv a4, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a4
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v24, v0, 1
+; RV32V-NEXT:    vand.vi v16, v0, 4
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v16, v0, 8
+; RV32V-NEXT:    addi a2, sp, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v0, v0, a3
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 4
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v0, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v24, v8, v24
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a2, sp, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v0, v8, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 4
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 4
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 2
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 2
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 5
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 5
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 4
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v8, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v24, v16
+; RV32V-NEXT:    vxor.vv v16, v16, v0
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 4
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v16, v24
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 2
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 5
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vxor.vv v8, v24, v8
+; RV32V-NEXT:    vsll.vi v16, v16, 4
+; RV32V-NEXT:    vsrl.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v8, v16
+; RV32V-NEXT:    vsrl.vi v16, v8, 2
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
+; RV32V-NEXT:    vand.vx v8, v8, a1
+; RV32V-NEXT:    vand.vx v16, v16, a1
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v8, v8, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add sp, sp, a0
+; RV32V-NEXT:    addi sp, sp, 16
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmulh_nxv1i16_vv:
+; RV64V-LABEL: clmulh_nxv64i8_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; RV64V-NEXT:    vzext.vf2 v10, v8
-; RV64V-NEXT:    vzext.vf2 v8, v9
-; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vand.vi v9, v8, 2
-; RV64V-NEXT:    vand.vi v11, v8, 1
-; RV64V-NEXT:    vmul.vv v9, v10, v9
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v11, v9
-; RV64V-NEXT:    vand.vi v11, v8, 4
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vi v11, v8, 8
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a0
-; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a0
-; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a0
-; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a0
-; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a0
-; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a0
-; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a0
-; RV64V-NEXT:    li a0, 1
-; RV64V-NEXT:    slli a0, a0, 11
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a0
-; RV64V-NEXT:    lui a0, 1
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a0
-; RV64V-NEXT:    lui a0, 2
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a0
-; RV64V-NEXT:    lui a0, 4
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a0
-; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    addi sp, sp, -16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    sub sp, sp, a0
+; RV64V-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV64V-NEXT:    vsll.vi v24, v8, 4
+; RV64V-NEXT:    vsrl.vi v8, v8, 4
+; RV64V-NEXT:    li a0, 51
+; RV64V-NEXT:    li a1, 85
+; RV64V-NEXT:    vsll.vi v0, v16, 4
+; RV64V-NEXT:    vsrl.vi v16, v16, 4
+; RV64V-NEXT:    li a2, 16
+; RV64V-NEXT:    li a3, 32
+; RV64V-NEXT:    vor.vv v16, v16, v0
+; RV64V-NEXT:    vsrl.vi v0, v16, 2
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vand.vx v0, v0, a0
+; RV64V-NEXT:    vsll.vi v16, v16, 2
+; RV64V-NEXT:    vor.vv v16, v0, v16
+; RV64V-NEXT:    vsrl.vi v0, v16, 1
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vand.vx v0, v0, a1
+; RV64V-NEXT:    vadd.vv v16, v16, v16
+; RV64V-NEXT:    vor.vv v0, v0, v16
+; RV64V-NEXT:    vand.vx v16, v0, a2
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 4
+; RV64V-NEXT:    mv a4, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a4
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    li a2, 64
+; RV64V-NEXT:    vor.vv v8, v8, v24
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
 ; RV64V-NEXT:    vand.vx v8, v8, a0
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vmul.vv v8, v10, v8
-; RV64V-NEXT:    vxor.vv v8, v9, v8
-; RV64V-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV64V-NEXT:    vnsrl.wi v8, v8, 16
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vand.vx v16, v0, a3
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    li a3, 128
+; RV64V-NEXT:    vand.vx v16, v0, a2
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 5
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v16, v0, 2
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a4, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a4
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v24, v0, 1
+; RV64V-NEXT:    vand.vi v16, v0, 4
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v16, v0, 8
+; RV64V-NEXT:    addi a2, sp, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v0, v0, a3
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 4
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v0, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vmul.vv v24, v8, v24
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a2, sp, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v0, v8, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 4
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 4
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 5
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 5
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 4
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v8, v8, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v24, v16
+; RV64V-NEXT:    vxor.vv v16, v16, v0
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 4
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v16, v24
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 5
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vxor.vv v8, v24, v8
+; RV64V-NEXT:    vsll.vi v16, v16, 4
+; RV64V-NEXT:    vsrl.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v8, v16
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v8, v8, 1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add sp, sp, a0
+; RV64V-NEXT:    addi sp, sp, 16
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmulh_nxv1i16_vv:
+; RV32ZVBC64-LABEL: clmulh_nxv64i8_vv:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf4 v10, v9
-; RV32ZVBC64-NEXT:    vzext.vf4 v9, v8
-; RV32ZVBC64-NEXT:    vclmul.vv v8, v9, v10
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 16
+; RV32ZVBC64-NEXT:    addi sp, sp, -16
+; RV32ZVBC64-NEXT:    csrr a0, vlenb
+; RV32ZVBC64-NEXT:    slli a0, a0, 3
+; RV32ZVBC64-NEXT:    mv a1, a0
+; RV32ZVBC64-NEXT:    slli a0, a0, 1
+; RV32ZVBC64-NEXT:    add a1, a1, a0
+; RV32ZVBC64-NEXT:    slli a0, a0, 1
+; RV32ZVBC64-NEXT:    add a0, a0, a1
+; RV32ZVBC64-NEXT:    sub sp, sp, a0
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV32ZVBC64-NEXT:    vsll.vi v24, v8, 4
+; RV32ZVBC64-NEXT:    vsrl.vi v8, v8, 4
+; RV32ZVBC64-NEXT:    li a0, 51
+; RV32ZVBC64-NEXT:    li a1, 85
+; RV32ZVBC64-NEXT:    vsll.vi v0, v16, 4
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v16, 4
+; RV32ZVBC64-NEXT:    li a2, 16
+; RV32ZVBC64-NEXT:    li a3, 32
+; RV32ZVBC64-NEXT:    vor.vv v16, v16, v0
+; RV32ZVBC64-NEXT:    vsrl.vi v0, v16, 2
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vand.vx v0, v0, a0
+; RV32ZVBC64-NEXT:    vsll.vi v16, v16, 2
+; RV32ZVBC64-NEXT:    vor.vv v16, v0, v16
+; RV32ZVBC64-NEXT:    vsrl.vi v0, v16, 1
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV32ZVBC64-NEXT:    vand.vx v0, v0, a1
+; RV32ZVBC64-NEXT:    vadd.vv v16, v16, v16
+; RV32ZVBC64-NEXT:    vor.vv v0, v0, v16
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a2
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 4
+; RV32ZVBC64-NEXT:    mv a4, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a4
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    li a2, 64
+; RV32ZVBC64-NEXT:    vor.vv v8, v8, v24
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV32ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a3
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    li a3, 128
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a2
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 5
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vi v16, v0, 2
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    mv a4, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a4
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vi v24, v0, 1
+; RV32ZVBC64-NEXT:    vand.vi v16, v0, 4
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vi v16, v0, 8
+; RV32ZVBC64-NEXT:    addi a2, sp, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v0, v0, a3
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 4
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v0, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    addi a2, sp, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 4
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 4
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 2
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 2
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 5
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 5
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 4
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v24, v16
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 4
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v16, v24
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 2
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 5
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v24, v8
+; RV32ZVBC64-NEXT:    vsll.vi v16, v16, 4
+; RV32ZVBC64-NEXT:    vsrl.vi v8, v8, 4
+; RV32ZVBC64-NEXT:    vor.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV32ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v8, v8, 1
+; RV32ZVBC64-NEXT:    csrr a0, vlenb
+; RV32ZVBC64-NEXT:    slli a0, a0, 3
+; RV32ZVBC64-NEXT:    mv a1, a0
+; RV32ZVBC64-NEXT:    slli a0, a0, 1
+; RV32ZVBC64-NEXT:    add a1, a1, a0
+; RV32ZVBC64-NEXT:    slli a0, a0, 1
+; RV32ZVBC64-NEXT:    add a0, a0, a1
+; RV32ZVBC64-NEXT:    add sp, sp, a0
+; RV32ZVBC64-NEXT:    addi sp, sp, 16
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmulh_nxv1i16_vv:
+; RV64ZVBC64-LABEL: clmulh_nxv64i8_vv:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf4 v10, v9
-; RV64ZVBC64-NEXT:    vzext.vf4 v9, v8
-; RV64ZVBC64-NEXT:    vclmul.vv v8, v9, v10
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 16
+; RV64ZVBC64-NEXT:    addi sp, sp, -16
+; RV64ZVBC64-NEXT:    csrr a0, vlenb
+; RV64ZVBC64-NEXT:    slli a0, a0, 3
+; RV64ZVBC64-NEXT:    mv a1, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 1
+; RV64ZVBC64-NEXT:    add a1, a1, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 1
+; RV64ZVBC64-NEXT:    add a0, a0, a1
+; RV64ZVBC64-NEXT:    sub sp, sp, a0
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV64ZVBC64-NEXT:    vsll.vi v24, v8, 4
+; RV64ZVBC64-NEXT:    vsrl.vi v8, v8, 4
+; RV64ZVBC64-NEXT:    li a0, 51
+; RV64ZVBC64-NEXT:    li a1, 85
+; RV64ZVBC64-NEXT:    vsll.vi v0, v16, 4
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v16, 4
+; RV64ZVBC64-NEXT:    li a2, 16
+; RV64ZVBC64-NEXT:    li a3, 32
+; RV64ZVBC64-NEXT:    vor.vv v16, v16, v0
+; RV64ZVBC64-NEXT:    vsrl.vi v0, v16, 2
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vand.vx v0, v0, a0
+; RV64ZVBC64-NEXT:    vsll.vi v16, v16, 2
+; RV64ZVBC64-NEXT:    vor.vv v16, v0, v16
+; RV64ZVBC64-NEXT:    vsrl.vi v0, v16, 1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vand.vx v0, v0, a1
+; RV64ZVBC64-NEXT:    vadd.vv v16, v16, v16
+; RV64ZVBC64-NEXT:    vor.vv v0, v0, v16
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a2
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 4
+; RV64ZVBC64-NEXT:    mv a4, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a4
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    li a2, 64
+; RV64ZVBC64-NEXT:    vor.vv v8, v8, v24
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a3
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    li a3, 128
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a2
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 5
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 2
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    mv a4, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a4
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v24, v0, 1
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 4
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 8
+; RV64ZVBC64-NEXT:    addi a2, sp, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v0, v0, a3
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 4
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v0, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vmul.vv v24, v8, v24
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    addi a2, sp, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 4
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 4
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 2
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 2
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 5
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 5
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 4
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v24, v16
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 4
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v16, v24
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 2
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 5
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v24, v8
+; RV64ZVBC64-NEXT:    vsll.vi v16, v16, 4
+; RV64ZVBC64-NEXT:    vsrl.vi v8, v8, 4
+; RV64ZVBC64-NEXT:    vor.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v8, v8, 1
+; RV64ZVBC64-NEXT:    csrr a0, vlenb
+; RV64ZVBC64-NEXT:    slli a0, a0, 3
+; RV64ZVBC64-NEXT:    mv a1, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 1
+; RV64ZVBC64-NEXT:    add a1, a1, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 1
+; RV64ZVBC64-NEXT:    add a0, a0, a1
+; RV64ZVBC64-NEXT:    add sp, sp, a0
+; RV64ZVBC64-NEXT:    addi sp, sp, 16
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmulh_nxv1i16_vv:
+; RV32ZVBC32-LABEL: clmulh_nxv64i8_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v16
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmulh_nxv1i16_vv:
+; RV64ZVBC32-LABEL: clmulh_nxv64i8_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v16
 ; RV64ZVBC32-NEXT:    ret
-  %va.ext = zext <vscale x 1 x i16> %va to <vscale x 1 x i32>
-  %vb.ext = zext <vscale x 1 x i16> %vb to <vscale x 1 x i32>
-  %clmul = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va.ext, <vscale x 1 x i32> %vb.ext)
-  %res.ext = lshr <vscale x 1 x i32> %clmul, splat(i32 16)
-  %res = trunc <vscale x 1 x i32> %res.ext to <vscale x 1 x i16>
-  ret <vscale x 1 x i16> %res
+  %va.ext = zext <vscale x 64 x i8> %va to <vscale x 64 x i16>
+  %vb.ext = zext <vscale x 64 x i8> %vb to <vscale x 64 x i16>
+  %clmul = call <vscale x 64 x i16> @llvm.clmul.nxv64i16(<vscale x 64 x i16> %va.ext, <vscale x 64 x i16> %vb.ext)
+  %res.ext = lshr <vscale x 64 x i16> %clmul, splat(i16 8)
+  %res = trunc <vscale x 64 x i16> %res.ext to <vscale x 64 x i8>
+  ret <vscale x 64 x i8> %res
 }
 
-define <vscale x 1 x i16> @clmulh_nxv1i16_vx(<vscale x 1 x i16> %va, i16 %b) nounwind {
-; RV32V-LABEL: clmulh_nxv1i16_vx:
+define <vscale x 64 x i8> @clmulh_nxv64i8_vx(<vscale x 64 x i8> %va, i8 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv64i8_vx:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; RV32V-NEXT:    vmv.v.x v10, a0
-; RV32V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32V-NEXT:    vzext.vf2 v9, v8
+; RV32V-NEXT:    addi sp, sp, -16
+; RV32V-NEXT:    csrr a1, vlenb
+; RV32V-NEXT:    slli a1, a1, 3
+; RV32V-NEXT:    mv a2, a1
+; RV32V-NEXT:    slli a1, a1, 1
+; RV32V-NEXT:    add a2, a2, a1
+; RV32V-NEXT:    slli a1, a1, 1
+; RV32V-NEXT:    add a1, a1, a2
+; RV32V-NEXT:    sub sp, sp, a1
+; RV32V-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV32V-NEXT:    vmv.v.x v24, a0
+; RV32V-NEXT:    vsll.vi v16, v8, 4
+; RV32V-NEXT:    vsrl.vi v8, v8, 4
+; RV32V-NEXT:    li a0, 51
+; RV32V-NEXT:    li a1, 85
+; RV32V-NEXT:    li a2, 16
+; RV32V-NEXT:    li a3, 32
+; RV32V-NEXT:    vsll.vi v0, v24, 4
+; RV32V-NEXT:    vsrl.vi v24, v24, 4
+; RV32V-NEXT:    vor.vv v24, v24, v0
+; RV32V-NEXT:    vsrl.vi v0, v24, 2
+; RV32V-NEXT:    vand.vx v24, v24, a0
+; RV32V-NEXT:    vand.vx v0, v0, a0
+; RV32V-NEXT:    vsll.vi v24, v24, 2
+; RV32V-NEXT:    vor.vv v24, v0, v24
+; RV32V-NEXT:    vsrl.vi v0, v24, 1
+; RV32V-NEXT:    vand.vx v24, v24, a1
+; RV32V-NEXT:    vand.vx v0, v0, a1
+; RV32V-NEXT:    vadd.vv v24, v24, v24
+; RV32V-NEXT:    vor.vv v0, v0, v24
+; RV32V-NEXT:    vand.vx v24, v0, a2
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 4
+; RV32V-NEXT:    mv a4, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a4
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    li a2, 64
+; RV32V-NEXT:    vor.vv v8, v8, v16
+; RV32V-NEXT:    vsrl.vi v16, v8, 2
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
+; RV32V-NEXT:    vand.vx v8, v8, a1
+; RV32V-NEXT:    vand.vx v16, v16, a1
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vand.vx v16, v0, a3
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    li a3, 128
+; RV32V-NEXT:    vand.vx v16, v0, a2
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 5
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v16, v0, 2
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    mv a4, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a4
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v24, v0, 1
+; RV32V-NEXT:    vand.vi v16, v0, 4
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v16, v0, 8
+; RV32V-NEXT:    addi a2, sp, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v0, v0, a3
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 4
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v0, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v24, v8, v24
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a2, sp, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v0, v8, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 4
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 4
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 2
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 2
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 5
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 5
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 4
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v8, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v16
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v24, v16
+; RV32V-NEXT:    vxor.vv v16, v16, v0
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 4
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v16, v24
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 2
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 5
+; RV32V-NEXT:    add a2, sp, a2
+; RV32V-NEXT:    addi a2, a2, 16
+; RV32V-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vxor.vv v8, v24, v8
+; RV32V-NEXT:    vsll.vi v16, v16, 4
+; RV32V-NEXT:    vsrl.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v8, v16
+; RV32V-NEXT:    vsrl.vi v16, v8, 2
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
+; RV32V-NEXT:    vand.vx v8, v8, a1
+; RV32V-NEXT:    vand.vx v16, v16, a1
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v8, v8, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add sp, sp, a0
+; RV32V-NEXT:    addi sp, sp, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv64i8_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    addi sp, sp, -16
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    sub sp, sp, a1
+; RV64V-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV64V-NEXT:    vmv.v.x v24, a0
+; RV64V-NEXT:    vsll.vi v16, v8, 4
+; RV64V-NEXT:    vsrl.vi v8, v8, 4
+; RV64V-NEXT:    li a0, 51
+; RV64V-NEXT:    li a1, 85
+; RV64V-NEXT:    li a2, 16
+; RV64V-NEXT:    li a3, 32
+; RV64V-NEXT:    vsll.vi v0, v24, 4
+; RV64V-NEXT:    vsrl.vi v24, v24, 4
+; RV64V-NEXT:    vor.vv v24, v24, v0
+; RV64V-NEXT:    vsrl.vi v0, v24, 2
+; RV64V-NEXT:    vand.vx v24, v24, a0
+; RV64V-NEXT:    vand.vx v0, v0, a0
+; RV64V-NEXT:    vsll.vi v24, v24, 2
+; RV64V-NEXT:    vor.vv v24, v0, v24
+; RV64V-NEXT:    vsrl.vi v0, v24, 1
+; RV64V-NEXT:    vand.vx v24, v24, a1
+; RV64V-NEXT:    vand.vx v0, v0, a1
+; RV64V-NEXT:    vadd.vv v24, v24, v24
+; RV64V-NEXT:    vor.vv v0, v0, v24
+; RV64V-NEXT:    vand.vx v24, v0, a2
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 4
+; RV64V-NEXT:    mv a4, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a4
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    li a2, 64
+; RV64V-NEXT:    vor.vv v8, v8, v16
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vand.vx v16, v0, a3
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    li a3, 128
+; RV64V-NEXT:    vand.vx v16, v0, a2
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 5
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v16, v0, 2
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a4, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a4
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v24, v0, 1
+; RV64V-NEXT:    vand.vi v16, v0, 4
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v16, v0, 8
+; RV64V-NEXT:    addi a2, sp, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v0, v0, a3
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 4
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v0, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vmul.vv v24, v8, v24
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a2, sp, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v0, v8, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 4
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 4
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 5
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 5
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 4
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v8, v8, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v16
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v24, v16
+; RV64V-NEXT:    vxor.vv v16, v16, v0
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 4
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v16, v24
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 5
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vxor.vv v8, v24, v8
+; RV64V-NEXT:    vsll.vi v16, v16, 4
+; RV64V-NEXT:    vsrl.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v8, v16
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v8, v8, 1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add sp, sp, a0
+; RV64V-NEXT:    addi sp, sp, 16
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmulh_nxv64i8_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    addi sp, sp, -16
+; RV32ZVBC64-NEXT:    csrr a1, vlenb
+; RV32ZVBC64-NEXT:    slli a1, a1, 3
+; RV32ZVBC64-NEXT:    mv a2, a1
+; RV32ZVBC64-NEXT:    slli a1, a1, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a1
+; RV32ZVBC64-NEXT:    slli a1, a1, 1
+; RV32ZVBC64-NEXT:    add a1, a1, a2
+; RV32ZVBC64-NEXT:    sub sp, sp, a1
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC64-NEXT:    vsll.vi v16, v8, 4
+; RV32ZVBC64-NEXT:    vsrl.vi v8, v8, 4
+; RV32ZVBC64-NEXT:    li a0, 51
+; RV32ZVBC64-NEXT:    li a1, 85
+; RV32ZVBC64-NEXT:    li a2, 16
+; RV32ZVBC64-NEXT:    li a3, 32
+; RV32ZVBC64-NEXT:    vsll.vi v0, v24, 4
+; RV32ZVBC64-NEXT:    vsrl.vi v24, v24, 4
+; RV32ZVBC64-NEXT:    vor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vsrl.vi v0, v24, 2
+; RV32ZVBC64-NEXT:    vand.vx v24, v24, a0
+; RV32ZVBC64-NEXT:    vand.vx v0, v0, a0
+; RV32ZVBC64-NEXT:    vsll.vi v24, v24, 2
+; RV32ZVBC64-NEXT:    vor.vv v24, v0, v24
+; RV32ZVBC64-NEXT:    vsrl.vi v0, v24, 1
+; RV32ZVBC64-NEXT:    vand.vx v24, v24, a1
+; RV32ZVBC64-NEXT:    vand.vx v0, v0, a1
+; RV32ZVBC64-NEXT:    vadd.vv v24, v24, v24
+; RV32ZVBC64-NEXT:    vor.vv v0, v0, v24
+; RV32ZVBC64-NEXT:    vand.vx v24, v0, a2
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 4
+; RV32ZVBC64-NEXT:    mv a4, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a4
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    li a2, 64
+; RV32ZVBC64-NEXT:    vor.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV32ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a3
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    li a3, 128
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a2
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 5
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vi v16, v0, 2
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    mv a4, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a4
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vi v24, v0, 1
+; RV32ZVBC64-NEXT:    vand.vi v16, v0, 4
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vi v16, v0, 8
+; RV32ZVBC64-NEXT:    addi a2, sp, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v0, v0, a3
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 4
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v0, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vmul.vv v24, v8, v24
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    addi a2, sp, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 4
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 4
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 2
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 2
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 5
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 5
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 4
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v16
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v24, v16
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 4
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v16, v24
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 3
+; RV32ZVBC64-NEXT:    mv a3, a2
+; RV32ZVBC64-NEXT:    slli a2, a2, 2
+; RV32ZVBC64-NEXT:    add a2, a2, a3
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    csrr a2, vlenb
+; RV32ZVBC64-NEXT:    slli a2, a2, 5
+; RV32ZVBC64-NEXT:    add a2, sp, a2
+; RV32ZVBC64-NEXT:    addi a2, a2, 16
+; RV32ZVBC64-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v24, v8
+; RV32ZVBC64-NEXT:    vsll.vi v16, v16, 4
+; RV32ZVBC64-NEXT:    vsrl.vi v8, v8, 4
+; RV32ZVBC64-NEXT:    vor.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV32ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v8, v8, 1
+; RV32ZVBC64-NEXT:    csrr a0, vlenb
+; RV32ZVBC64-NEXT:    slli a0, a0, 3
+; RV32ZVBC64-NEXT:    mv a1, a0
+; RV32ZVBC64-NEXT:    slli a0, a0, 1
+; RV32ZVBC64-NEXT:    add a1, a1, a0
+; RV32ZVBC64-NEXT:    slli a0, a0, 1
+; RV32ZVBC64-NEXT:    add a0, a0, a1
+; RV32ZVBC64-NEXT:    add sp, sp, a0
+; RV32ZVBC64-NEXT:    addi sp, sp, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv64i8_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    addi sp, sp, -16
+; RV64ZVBC64-NEXT:    csrr a1, vlenb
+; RV64ZVBC64-NEXT:    slli a1, a1, 3
+; RV64ZVBC64-NEXT:    mv a2, a1
+; RV64ZVBC64-NEXT:    slli a1, a1, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a1
+; RV64ZVBC64-NEXT:    slli a1, a1, 1
+; RV64ZVBC64-NEXT:    add a1, a1, a2
+; RV64ZVBC64-NEXT:    sub sp, sp, a1
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC64-NEXT:    vsll.vi v16, v8, 4
+; RV64ZVBC64-NEXT:    vsrl.vi v8, v8, 4
+; RV64ZVBC64-NEXT:    li a0, 51
+; RV64ZVBC64-NEXT:    li a1, 85
+; RV64ZVBC64-NEXT:    li a2, 16
+; RV64ZVBC64-NEXT:    li a3, 32
+; RV64ZVBC64-NEXT:    vsll.vi v0, v24, 4
+; RV64ZVBC64-NEXT:    vsrl.vi v24, v24, 4
+; RV64ZVBC64-NEXT:    vor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vsrl.vi v0, v24, 2
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a0
+; RV64ZVBC64-NEXT:    vand.vx v0, v0, a0
+; RV64ZVBC64-NEXT:    vsll.vi v24, v24, 2
+; RV64ZVBC64-NEXT:    vor.vv v24, v0, v24
+; RV64ZVBC64-NEXT:    vsrl.vi v0, v24, 1
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a1
+; RV64ZVBC64-NEXT:    vand.vx v0, v0, a1
+; RV64ZVBC64-NEXT:    vadd.vv v24, v24, v24
+; RV64ZVBC64-NEXT:    vor.vv v0, v0, v24
+; RV64ZVBC64-NEXT:    vand.vx v24, v0, a2
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 4
+; RV64ZVBC64-NEXT:    mv a4, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a4
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    li a2, 64
+; RV64ZVBC64-NEXT:    vor.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a3
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    li a3, 128
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a2
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 5
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 2
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    mv a4, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a4
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v24, v0, 1
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 4
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 8
+; RV64ZVBC64-NEXT:    addi a2, sp, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v0, v0, a3
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 4
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v0, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vmul.vv v24, v8, v24
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    addi a2, sp, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 4
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 4
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 2
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 2
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 5
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 5
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 4
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v16
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v24, v16
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 4
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v16, v24
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 3
+; RV64ZVBC64-NEXT:    mv a3, a2
+; RV64ZVBC64-NEXT:    slli a2, a2, 2
+; RV64ZVBC64-NEXT:    add a2, a2, a3
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    csrr a2, vlenb
+; RV64ZVBC64-NEXT:    slli a2, a2, 5
+; RV64ZVBC64-NEXT:    add a2, sp, a2
+; RV64ZVBC64-NEXT:    addi a2, a2, 16
+; RV64ZVBC64-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v24, v8
+; RV64ZVBC64-NEXT:    vsll.vi v16, v16, 4
+; RV64ZVBC64-NEXT:    vsrl.vi v8, v8, 4
+; RV64ZVBC64-NEXT:    vor.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v8, v8, 1
+; RV64ZVBC64-NEXT:    csrr a0, vlenb
+; RV64ZVBC64-NEXT:    slli a0, a0, 3
+; RV64ZVBC64-NEXT:    mv a1, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 1
+; RV64ZVBC64-NEXT:    add a1, a1, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 1
+; RV64ZVBC64-NEXT:    add a0, a0, a1
+; RV64ZVBC64-NEXT:    add sp, sp, a0
+; RV64ZVBC64-NEXT:    addi sp, sp, 16
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv64i8_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv64i8_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %va.ext = zext <vscale x 64 x i8> %va to <vscale x 64 x i16>
+  %vb.ext = zext <vscale x 64 x i8> %vb to <vscale x 64 x i16>
+  %clmul = call <vscale x 64 x i16> @llvm.clmul.nxv64i16(<vscale x 64 x i16> %va.ext, <vscale x 64 x i16> %vb.ext)
+  %res.ext = lshr <vscale x 64 x i16> %clmul, splat(i16 8)
+  %res = trunc <vscale x 64 x i16> %res.ext to <vscale x 64 x i8>
+  ret <vscale x 64 x i8> %res
+}
+
+define <vscale x 1 x i16> @clmulh_nxv1i16_vv(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb) nounwind {
+; RV32V-LABEL: clmulh_nxv1i16_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV32V-NEXT:    vzext.vf2 v10, v8
+; RV32V-NEXT:    vzext.vf2 v8, v9
 ; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vzext.vf2 v8, v10
-; RV32V-NEXT:    vand.vi v10, v8, 2
+; RV32V-NEXT:    vand.vi v9, v8, 2
 ; RV32V-NEXT:    vand.vi v11, v8, 1
-; RV32V-NEXT:    vmul.vv v10, v9, v10
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vmul.vv v9, v10, v9
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v11, v9
 ; RV32V-NEXT:    vand.vi v11, v8, 4
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
 ; RV32V-NEXT:    vand.vi v11, v8, 8
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
 ; RV32V-NEXT:    vand.vx v11, v8, a0
 ; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
 ; RV32V-NEXT:    vand.vx v11, v8, a0
 ; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
 ; RV32V-NEXT:    vand.vx v11, v8, a0
 ; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
 ; RV32V-NEXT:    vand.vx v11, v8, a0
 ; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
 ; RV32V-NEXT:    vand.vx v11, v8, a0
 ; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
 ; RV32V-NEXT:    vand.vx v11, v8, a0
 ; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
 ; RV32V-NEXT:    vand.vx v11, v8, a0
 ; RV32V-NEXT:    li a0, 1
 ; RV32V-NEXT:    slli a0, a0, 11
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
 ; RV32V-NEXT:    vand.vx v11, v8, a0
 ; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
 ; RV32V-NEXT:    vand.vx v11, v8, a0
 ; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
 ; RV32V-NEXT:    vand.vx v11, v8, a0
 ; RV32V-NEXT:    lui a0, 4
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
 ; RV32V-NEXT:    vand.vx v11, v8, a0
 ; RV32V-NEXT:    lui a0, 8
 ; RV32V-NEXT:    vand.vx v8, v8, a0
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    vxor.vv v8, v9, v8
 ; RV32V-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; RV32V-NEXT:    vnsrl.wi v8, v8, 16
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmulh_nxv1i16_vx:
+; RV64V-LABEL: clmulh_nxv1i16_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; RV64V-NEXT:    vmv.v.x v10, a0
-; RV64V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64V-NEXT:    vzext.vf2 v9, v8
+; RV64V-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV64V-NEXT:    vzext.vf2 v10, v8
+; RV64V-NEXT:    vzext.vf2 v8, v9
 ; RV64V-NEXT:    li a0, 16
-; RV64V-NEXT:    vzext.vf2 v8, v10
-; RV64V-NEXT:    vand.vi v10, v8, 2
+; RV64V-NEXT:    vand.vi v9, v8, 2
 ; RV64V-NEXT:    vand.vi v11, v8, 1
-; RV64V-NEXT:    vmul.vv v10, v9, v10
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vmul.vv v9, v10, v9
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v11, v9
 ; RV64V-NEXT:    vand.vi v11, v8, 4
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
 ; RV64V-NEXT:    vand.vi v11, v8, 8
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
 ; RV64V-NEXT:    vand.vx v11, v8, a0
 ; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
 ; RV64V-NEXT:    vand.vx v11, v8, a0
 ; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
 ; RV64V-NEXT:    vand.vx v11, v8, a0
 ; RV64V-NEXT:    li a0, 128
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
 ; RV64V-NEXT:    vand.vx v11, v8, a0
 ; RV64V-NEXT:    li a0, 256
-; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vmul.vv v8, v10, v8
+; RV64V-NEXT:    vxor.vv v8, v9, v8
+; RV64V-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v8, 16
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmulh_nxv1i16_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf4 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf4 v9, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v8, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv1i16_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf4 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf4 v9, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v9, v10
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v8, 16
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv1i16_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv1i16_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
+; RV64ZVBC32-NEXT:    ret
+  %va.ext = zext <vscale x 1 x i16> %va to <vscale x 1 x i32>
+  %vb.ext = zext <vscale x 1 x i16> %vb to <vscale x 1 x i32>
+  %clmul = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va.ext, <vscale x 1 x i32> %vb.ext)
+  %res.ext = lshr <vscale x 1 x i32> %clmul, splat(i32 16)
+  %res = trunc <vscale x 1 x i32> %res.ext to <vscale x 1 x i16>
+  ret <vscale x 1 x i16> %res
+}
+
+define <vscale x 1 x i16> @clmulh_nxv1i16_vx(<vscale x 1 x i16> %va, i16 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv1i16_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV32V-NEXT:    vmv.v.x v10, a0
+; RV32V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32V-NEXT:    vzext.vf2 v9, v8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vzext.vf2 v8, v10
+; RV32V-NEXT:    vand.vi v10, v8, 2
+; RV32V-NEXT:    vand.vi v11, v8, 1
+; RV32V-NEXT:    vmul.vv v10, v9, v10
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v8, 4
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v8, 8
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v8, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv1i16_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV64V-NEXT:    vmv.v.x v10, a0
+; RV64V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64V-NEXT:    vzext.vf2 v9, v8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vzext.vf2 v8, v10
+; RV64V-NEXT:    vand.vi v10, v8, 2
+; RV64V-NEXT:    vand.vi v11, v8, 1
+; RV64V-NEXT:    vmul.vv v10, v9, v10
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v8, 4
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v8, 8
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v9, v11
 ; RV64V-NEXT:    vxor.vv v10, v10, v11
 ; RV64V-NEXT:    vand.vx v11, v8, a0
 ; RV64V-NEXT:    li a0, 512
@@ -2671,143 +4020,13 @@ define <vscale x 1 x i16> @clmulh_nxv1i16_vx(<vscale x 1 x i16> %va, i16 %b) nou
 ; RV32ZVBC32-LABEL: clmulh_nxv1i16_vx:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v10, a0
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v9, v8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v10
-; RV32ZVBC32-NEXT:    vand.vi v10, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v9, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmulh_nxv1i16_vx:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v10, a0
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v9, v8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v10
-; RV64ZVBC32-NEXT:    vand.vi v10, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v9, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v9, v8
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
   %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
@@ -2984,140 +4203,14 @@ define <vscale x 2 x i16> @clmulh_nxv2i16_vv(<vscale x 2 x i16> %va, <vscale x 2
 ;
 ; RV32ZVBC32-LABEL: clmulh_nxv2i16_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmulh_nxv2i16_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 2 x i16> %va to <vscale x 2 x i32>
   %vb.ext = zext <vscale x 2 x i16> %vb to <vscale x 2 x i32>
@@ -3301,143 +4394,13 @@ define <vscale x 2 x i16> @clmulh_nxv2i16_vx(<vscale x 2 x i16> %va, i16 %b) nou
 ; RV32ZVBC32-LABEL: clmulh_nxv2i16_vx:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v10, a0
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v9, v8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v10
-; RV32ZVBC32-NEXT:    vand.vi v10, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v10, v9, v10
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v8, v9, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmulh_nxv2i16_vx:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v10, a0
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v9, v8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v10
-; RV64ZVBC32-NEXT:    vand.vi v10, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v9, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v9, v8
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v8, 16
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
   %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
@@ -3614,140 +4577,14 @@ define <vscale x 4 x i16> @clmulh_nxv4i16_vv(<vscale x 4 x i16> %va, <vscale x 4
 ;
 ; RV32ZVBC32-LABEL: clmulh_nxv4i16_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC32-NEXT:    vzext.vf2 v12, v9
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v8, v12, 2
-; RV32ZVBC32-NEXT:    vand.vi v14, v12, 1
-; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v14, v8
-; RV32ZVBC32-NEXT:    vand.vi v14, v12, 4
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vi v14, v12, 8
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v12, v12, a0
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vmul.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v10, 16
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmulh_nxv4i16_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC32-NEXT:    vzext.vf2 v12, v9
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v8, v12, 2
-; RV64ZVBC32-NEXT:    vand.vi v14, v12, 1
-; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v14, v8
-; RV64ZVBC32-NEXT:    vand.vi v14, v12, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vi v14, v12, 8
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v12, v12, a0
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vmul.vv v10, v10, v12
-; RV64ZVBC32-NEXT:    vxor.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v10, 16
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 4 x i16> %va to <vscale x 4 x i32>
   %vb.ext = zext <vscale x 4 x i16> %vb to <vscale x 4 x i32>
@@ -3931,143 +4768,13 @@ define <vscale x 4 x i16> @clmulh_nxv4i16_vx(<vscale x 4 x i16> %va, i16 %b) nou
 ; RV32ZVBC32-LABEL: clmulh_nxv4i16_vx:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v12, a0
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v12
-; RV32ZVBC32-NEXT:    vand.vi v12, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v14, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v14, v12
-; RV32ZVBC32-NEXT:    vand.vi v14, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vi v14, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vxor.vv v10, v12, v8
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v10, 16
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmulh_nxv4i16_vx:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v12, a0
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v12
-; RV64ZVBC32-NEXT:    vand.vi v12, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v14, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
-; RV64ZVBC32-NEXT:    vand.vi v14, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vi v14, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vxor.vv v10, v12, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v10, 16
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
   %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -4244,140 +4951,14 @@ define <vscale x 8 x i16> @clmulh_nxv8i16_vv(<vscale x 8 x i16> %va, <vscale x 8
 ;
 ; RV32ZVBC32-LABEL: clmulh_nxv8i16_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v12, v8
-; RV32ZVBC32-NEXT:    vzext.vf2 v16, v10
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vand.vi v8, v16, 2
-; RV32ZVBC32-NEXT:    vand.vi v20, v16, 1
-; RV32ZVBC32-NEXT:    vmul.vv v8, v12, v8
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v20, v8
-; RV32ZVBC32-NEXT:    vand.vi v20, v16, 4
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vi v20, v16, 8
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v16, v16, a0
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vmul.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vxor.vv v12, v8, v12
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v12, 16
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v10
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmulh_nxv8i16_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v12, v8
-; RV64ZVBC32-NEXT:    vzext.vf2 v16, v10
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vand.vi v8, v16, 2
-; RV64ZVBC32-NEXT:    vand.vi v20, v16, 1
-; RV64ZVBC32-NEXT:    vmul.vv v8, v12, v8
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v20, v8
-; RV64ZVBC32-NEXT:    vand.vi v20, v16, 4
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vi v20, v16, 8
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v16, v16, a0
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vmul.vv v12, v12, v16
-; RV64ZVBC32-NEXT:    vxor.vv v12, v8, v12
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v12, 16
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v10
 ; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 8 x i16> %va to <vscale x 8 x i32>
   %vb.ext = zext <vscale x 8 x i16> %vb to <vscale x 8 x i32>
@@ -4561,143 +5142,13 @@ define <vscale x 8 x i16> @clmulh_nxv8i16_vx(<vscale x 8 x i16> %va, i16 %b) nou
 ; RV32ZVBC32-LABEL: clmulh_nxv8i16_vx:
 ; RV32ZVBC32:       # %bb.0:
 ; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v16, a0
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v12, v8
-; RV32ZVBC32-NEXT:    li a0, 16
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v16
-; RV32ZVBC32-NEXT:    vand.vi v16, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v20, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v16, v12, v16
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v20, v16
-; RV32ZVBC32-NEXT:    vand.vi v20, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vi v20, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 32
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 64
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 128
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 256
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 512
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV32ZVBC32-NEXT:    li a0, 1
-; RV32ZVBC32-NEXT:    slli a0, a0, 11
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 1
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 2
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 4
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV32ZVBC32-NEXT:    lui a0, 8
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vmul.vv v8, v12, v8
-; RV32ZVBC32-NEXT:    vxor.vv v12, v16, v8
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wi v8, v12, 16
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
 ; RV64ZVBC32-LABEL: clmulh_nxv8i16_vx:
 ; RV64ZVBC32:       # %bb.0:
 ; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v16, a0
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v12, v8
-; RV64ZVBC32-NEXT:    li a0, 16
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v16
-; RV64ZVBC32-NEXT:    vand.vi v16, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v20, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v16, v12, v16
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v20, v16
-; RV64ZVBC32-NEXT:    vand.vi v20, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vi v20, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 64
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 128
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 256
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 512
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV64ZVBC32-NEXT:    li a0, 1
-; RV64ZVBC32-NEXT:    slli a0, a0, 11
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 1
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 2
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 4
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV64ZVBC32-NEXT:    lui a0, 8
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vmul.vv v8, v12, v8
-; RV64ZVBC32-NEXT:    vxor.vv v12, v16, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wi v8, v12, 16
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
   %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
@@ -4710,74 +5161,293 @@ define <vscale x 8 x i16> @clmulh_nxv8i16_vx(<vscale x 8 x i16> %va, i16 %b) nou
 }
 
 define <vscale x 16 x i16> @clmulh_nxv16i16_vv(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb) nounwind {
-; CHECK-LABEL: clmulh_nxv16i16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vzext.vf2 v16, v8
-; CHECK-NEXT:    vzext.vf2 v24, v12
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v8, v24, 2
-; CHECK-NEXT:    vand.vi v0, v24, 1
-; CHECK-NEXT:    vmul.vv v8, v16, v8
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v0, v8
-; CHECK-NEXT:    vand.vi v0, v24, 4
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vi v0, v24, 8
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vx v0, v24, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vx v0, v24, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vx v0, v24, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vx v0, v24, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vx v0, v24, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vx v0, v24, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vx v0, v24, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vx v0, v24, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vx v0, v24, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vx v0, v24, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vand.vx v0, v24, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v24, v24, a0
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v8, v8, v0
-; CHECK-NEXT:    vmul.vv v16, v16, v24
-; CHECK-NEXT:    vxor.vv v16, v8, v16
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v16, 16
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmulh_nxv16i16_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32V-NEXT:    vzext.vf2 v16, v8
+; RV32V-NEXT:    vzext.vf2 v24, v12
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v8, v24, 2
+; RV32V-NEXT:    vand.vi v0, v24, 1
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v0, v8
+; RV32V-NEXT:    vand.vi v0, v24, 4
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vi v0, v24, 8
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v24, v24, a0
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vmul.vv v16, v16, v24
+; RV32V-NEXT:    vxor.vv v16, v8, v16
+; RV32V-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v16, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv16i16_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV64V-NEXT:    vzext.vf2 v16, v8
+; RV64V-NEXT:    vzext.vf2 v24, v12
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v8, v24, 2
+; RV64V-NEXT:    vand.vi v0, v24, 1
+; RV64V-NEXT:    vmul.vv v8, v16, v8
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v0, v8
+; RV64V-NEXT:    vand.vi v0, v24, 4
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vi v0, v24, 8
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v24, v24, a0
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vmul.vv v16, v16, v24
+; RV64V-NEXT:    vxor.vv v16, v8, v16
+; RV64V-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v16, 16
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmulh_nxv16i16_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v24, v12
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vand.vi v8, v24, 2
+; RV32ZVBC64-NEXT:    vand.vi v0, v24, 1
+; RV32ZVBC64-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v0, v8
+; RV32ZVBC64-NEXT:    vand.vi v0, v24, 4
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vi v0, v24, 8
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV32ZVBC64-NEXT:    li a0, 256
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV32ZVBC64-NEXT:    li a0, 512
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV32ZVBC64-NEXT:    li a0, 1024
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV32ZVBC64-NEXT:    li a0, 1
+; RV32ZVBC64-NEXT:    slli a0, a0, 11
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV32ZVBC64-NEXT:    lui a0, 1
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV32ZVBC64-NEXT:    lui a0, 2
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV32ZVBC64-NEXT:    lui a0, 4
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV32ZVBC64-NEXT:    lui a0, 8
+; RV32ZVBC64-NEXT:    vand.vx v24, v24, a0
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vmul.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vxor.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv16i16_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v24, v12
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vand.vi v8, v24, 2
+; RV64ZVBC64-NEXT:    vand.vi v0, v24, 1
+; RV64ZVBC64-NEXT:    vmul.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v0, v8
+; RV64ZVBC64-NEXT:    vand.vi v0, v24, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vi v0, v24, 8
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV64ZVBC64-NEXT:    li a0, 256
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV64ZVBC64-NEXT:    li a0, 512
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV64ZVBC64-NEXT:    li a0, 1024
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV64ZVBC64-NEXT:    li a0, 1
+; RV64ZVBC64-NEXT:    slli a0, a0, 11
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV64ZVBC64-NEXT:    lui a0, 1
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV64ZVBC64-NEXT:    lui a0, 2
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV64ZVBC64-NEXT:    lui a0, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v24, a0
+; RV64ZVBC64-NEXT:    lui a0, 8
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a0
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vmul.vv v16, v16, v24
+; RV64ZVBC64-NEXT:    vxor.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 16
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv16i16_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv16i16_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v12
+; RV64ZVBC32-NEXT:    ret
   %va.ext = zext <vscale x 16 x i16> %va to <vscale x 16 x i32>
   %vb.ext = zext <vscale x 16 x i16> %vb to <vscale x 16 x i32>
   %clmul = call <vscale x 16 x i32> @llvm.clmul.nxv16i32(<vscale x 16 x i32> %va.ext, <vscale x 16 x i32> %vb.ext)
@@ -4787,76 +5457,301 @@ define <vscale x 16 x i16> @clmulh_nxv16i16_vv(<vscale x 16 x i16> %va, <vscale
 }
 
 define <vscale x 16 x i16> @clmulh_nxv16i16_vx(<vscale x 16 x i16> %va, i16 %b) nounwind {
-; CHECK-LABEL: clmulh_nxv16i16_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v24, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vzext.vf2 v16, v8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vzext.vf2 v8, v24
-; CHECK-NEXT:    vand.vi v24, v8, 2
-; CHECK-NEXT:    vand.vi v0, v8, 1
-; CHECK-NEXT:    vmul.vv v24, v16, v24
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v0, v24
-; CHECK-NEXT:    vand.vi v0, v8, 4
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vi v0, v8, 8
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v8, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v8, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v8, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v8, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v8, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v8, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v8, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v0, v16, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vmul.vv v8, v16, v8
-; CHECK-NEXT:    vxor.vv v16, v24, v8
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v16, 16
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmulh_nxv16i16_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; RV32V-NEXT:    vmv.v.x v24, a0
+; RV32V-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; RV32V-NEXT:    vzext.vf2 v16, v8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vzext.vf2 v8, v24
+; RV32V-NEXT:    vand.vi v24, v8, 2
+; RV32V-NEXT:    vand.vi v0, v8, 1
+; RV32V-NEXT:    vmul.vv v24, v16, v24
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v0, v24
+; RV32V-NEXT:    vand.vi v0, v8, 4
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vi v0, v8, 8
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vxor.vv v16, v24, v8
+; RV32V-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v16, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv16i16_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; RV64V-NEXT:    vmv.v.x v24, a0
+; RV64V-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; RV64V-NEXT:    vzext.vf2 v16, v8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vzext.vf2 v8, v24
+; RV64V-NEXT:    vand.vi v24, v8, 2
+; RV64V-NEXT:    vand.vi v0, v8, 1
+; RV64V-NEXT:    vmul.vv v24, v16, v24
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v0, v24
+; RV64V-NEXT:    vand.vi v0, v8, 4
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vi v0, v8, 8
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vv v8, v16, v8
+; RV64V-NEXT:    vxor.vv v16, v24, v8
+; RV64V-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v16, 16
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmulh_nxv16i16_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC64-NEXT:    li a0, 16
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v24
+; RV32ZVBC64-NEXT:    vand.vi v24, v8, 2
+; RV32ZVBC64-NEXT:    vand.vi v0, v8, 1
+; RV32ZVBC64-NEXT:    vmul.vv v24, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV32ZVBC64-NEXT:    vand.vi v0, v8, 4
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vi v0, v8, 8
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC64-NEXT:    li a0, 64
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC64-NEXT:    li a0, 128
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC64-NEXT:    li a0, 256
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC64-NEXT:    li a0, 512
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC64-NEXT:    li a0, 1024
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC64-NEXT:    li a0, 1
+; RV32ZVBC64-NEXT:    slli a0, a0, 11
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC64-NEXT:    lui a0, 1
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC64-NEXT:    lui a0, 2
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC64-NEXT:    lui a0, 4
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV32ZVBC64-NEXT:    lui a0, 8
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vxor.vv v16, v24, v8
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wi v8, v16, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv16i16_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC64-NEXT:    li a0, 16
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v24
+; RV64ZVBC64-NEXT:    vand.vi v24, v8, 2
+; RV64ZVBC64-NEXT:    vand.vi v0, v8, 1
+; RV64ZVBC64-NEXT:    vmul.vv v24, v16, v24
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV64ZVBC64-NEXT:    vand.vi v0, v8, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vi v0, v8, 8
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV64ZVBC64-NEXT:    li a0, 64
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV64ZVBC64-NEXT:    li a0, 128
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV64ZVBC64-NEXT:    li a0, 256
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV64ZVBC64-NEXT:    li a0, 512
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV64ZVBC64-NEXT:    li a0, 1024
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV64ZVBC64-NEXT:    li a0, 1
+; RV64ZVBC64-NEXT:    slli a0, a0, 11
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV64ZVBC64-NEXT:    lui a0, 1
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV64ZVBC64-NEXT:    lui a0, 2
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV64ZVBC64-NEXT:    lui a0, 4
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vand.vx v0, v8, a0
+; RV64ZVBC64-NEXT:    lui a0, 8
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    vmul.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vmul.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vxor.vv v16, v24, v8
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wi v8, v16, 16
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv16i16_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv16i16_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
   %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
   %va.ext = zext <vscale x 16 x i16> %va to <vscale x 16 x i32>
@@ -4868,10275 +5763,16093 @@ define <vscale x 16 x i16> @clmulh_nxv16i16_vx(<vscale x 16 x i16> %va, i16 %b)
 }
 
 define <vscale x 32 x i16> @clmulh_nxv32i16_vv(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb) nounwind {
-; CHECK-LABEL: clmulh_nxv32i16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 7
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v24, v8, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    lui a3, 1
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    lui a6, 5
-; CHECK-NEXT:    vsrl.vi v0, v16, 8
-; CHECK-NEXT:    vsll.vi v16, v16, 8
-; CHECK-NEXT:    li t4, 16
-; CHECK-NEXT:    li a7, 32
-; CHECK-NEXT:    li a4, 64
-; CHECK-NEXT:    li a5, 128
-; CHECK-NEXT:    li t0, 256
-; CHECK-NEXT:    li t1, 512
-; CHECK-NEXT:    li t2, 1024
-; CHECK-NEXT:    li t5, 1
-; CHECK-NEXT:    lui t3, 2
-; CHECK-NEXT:    addi a2, a3, -241
-; CHECK-NEXT:    addi a1, a0, 819
-; CHECK-NEXT:    addi a0, a6, 1365
-; CHECK-NEXT:    vor.vv v16, v16, v0
-; CHECK-NEXT:    vsrl.vi v0, v16, 4
-; CHECK-NEXT:    vand.vx v16, v16, a2
-; CHECK-NEXT:    vand.vx v0, v0, a2
-; CHECK-NEXT:    vsll.vi v16, v16, 4
-; CHECK-NEXT:    vor.vv v16, v0, v16
-; CHECK-NEXT:    vsrl.vi v0, v16, 2
-; CHECK-NEXT:    vand.vx v16, v16, a1
-; CHECK-NEXT:    vand.vx v0, v0, a1
-; CHECK-NEXT:    vsll.vi v16, v16, 2
-; CHECK-NEXT:    vor.vv v16, v0, v16
-; CHECK-NEXT:    vsrl.vi v0, v16, 1
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vand.vx v0, v0, a0
-; CHECK-NEXT:    vadd.vv v16, v16, v16
-; CHECK-NEXT:    vor.vv v0, v0, v16
-; CHECK-NEXT:    vand.vx v16, v0, t4
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 3
-; CHECK-NEXT:    mv t4, a6
-; CHECK-NEXT:    slli a6, a6, 1
-; CHECK-NEXT:    add t4, t4, a6
-; CHECK-NEXT:    slli a6, a6, 1
-; CHECK-NEXT:    add t4, t4, a6
-; CHECK-NEXT:    slli a6, a6, 1
-; CHECK-NEXT:    add a6, a6, t4
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    lui a6, 4
-; CHECK-NEXT:    vor.vv v8, v8, v24
-; CHECK-NEXT:    vsrl.vi v16, v8, 4
-; CHECK-NEXT:    vand.vx v8, v8, a2
-; CHECK-NEXT:    vand.vx v16, v16, a2
-; CHECK-NEXT:    vsll.vi v8, v8, 4
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a1
-; CHECK-NEXT:    vand.vx v16, v16, a1
-; CHECK-NEXT:    vsll.vi v8, v8, 2
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vand.vx v16, v0, a7
-; CHECK-NEXT:    csrr a7, vlenb
-; CHECK-NEXT:    slli a7, a7, 4
-; CHECK-NEXT:    mv t4, a7
-; CHECK-NEXT:    slli a7, a7, 1
-; CHECK-NEXT:    add t4, t4, a7
-; CHECK-NEXT:    slli a7, a7, 1
-; CHECK-NEXT:    add a7, a7, t4
-; CHECK-NEXT:    add a7, sp, a7
-; CHECK-NEXT:    addi a7, a7, 16
-; CHECK-NEXT:    vs8r.v v16, (a7) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    lui a7, 8
-; CHECK-NEXT:    slli t5, t5, 11
-; CHECK-NEXT:    vand.vx v16, v0, a4
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    mv t4, a4
-; CHECK-NEXT:    slli a4, a4, 2
-; CHECK-NEXT:    add t4, t4, a4
-; CHECK-NEXT:    slli a4, a4, 1
-; CHECK-NEXT:    add a4, a4, t4
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, a5
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 5
-; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:    slli a4, a4, 1
-; CHECK-NEXT:    add a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, t0
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:    slli a4, a4, 1
-; CHECK-NEXT:    add a5, a5, a4
-; CHECK-NEXT:    slli a4, a4, 2
-; CHECK-NEXT:    add a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, t1
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 4
-; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:    slli a4, a4, 2
-; CHECK-NEXT:    add a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, t2
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    add a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, t5
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 6
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, t3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vi v16, v0, 2
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vi v24, v0, 1
-; CHECK-NEXT:    vand.vi v16, v0, 4
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vi v16, v0, 8
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, a6
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 5
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, a7
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vmul.vv v16, v8, v24
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v24, v8, v16
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v0, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 5
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 5
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 6
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 6
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 5
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 5
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v16, v8, v16
-; CHECK-NEXT:    vxor.vv v16, v16, v24
-; CHECK-NEXT:    vxor.vv v16, v16, v0
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v16, v16, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v16, v16, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v16, v16, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 5
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v16, v16, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v16, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 6
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 5
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v8, v24, v8
-; CHECK-NEXT:    vsll.vi v16, v16, 8
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v16, v8, 4
-; CHECK-NEXT:    vand.vx v8, v8, a2
-; CHECK-NEXT:    vand.vx v16, v16, a2
-; CHECK-NEXT:    vsll.vi v8, v8, 4
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a1
-; CHECK-NEXT:    vand.vx v16, v16, a1
-; CHECK-NEXT:    vsll.vi v8, v8, 2
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v8, v8, 1
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 7
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-  %va.ext = zext <vscale x 32 x i16> %va to <vscale x 32 x i32>
-  %vb.ext = zext <vscale x 32 x i16> %vb to <vscale x 32 x i32>
-  %clmul = call <vscale x 32 x i32> @llvm.clmul.nxv32i32(<vscale x 32 x i32> %va.ext, <vscale x 32 x i32> %vb.ext)
-  %res.ext = lshr <vscale x 32 x i32> %clmul, splat(i32 16)
-  %res = trunc <vscale x 32 x i32> %res.ext to <vscale x 32 x i16>
-  ret <vscale x 32 x i16> %res
-}
-
-define <vscale x 32 x i16> @clmulh_nxv32i16_vx(<vscale x 32 x i16> %va, i16 %b) nounwind {
-; CHECK-LABEL: clmulh_nxv32i16_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 7
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v24, a0
-; CHECK-NEXT:    vsrl.vi v16, v8, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    lui a3, 1
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    lui a7, 5
-; CHECK-NEXT:    li t4, 16
-; CHECK-NEXT:    li a6, 32
-; CHECK-NEXT:    li a4, 64
-; CHECK-NEXT:    li a5, 128
-; CHECK-NEXT:    li t0, 256
-; CHECK-NEXT:    li t1, 512
-; CHECK-NEXT:    li t2, 1024
-; CHECK-NEXT:    li t5, 1
-; CHECK-NEXT:    lui t3, 2
-; CHECK-NEXT:    addi a2, a3, -241
-; CHECK-NEXT:    addi a1, a0, 819
-; CHECK-NEXT:    addi a0, a7, 1365
-; CHECK-NEXT:    vsrl.vi v0, v24, 8
-; CHECK-NEXT:    vsll.vi v24, v24, 8
-; CHECK-NEXT:    vor.vv v24, v24, v0
-; CHECK-NEXT:    vsrl.vi v0, v24, 4
-; CHECK-NEXT:    vand.vx v24, v24, a2
-; CHECK-NEXT:    vand.vx v0, v0, a2
-; CHECK-NEXT:    vsll.vi v24, v24, 4
-; CHECK-NEXT:    vor.vv v24, v0, v24
-; CHECK-NEXT:    vsrl.vi v0, v24, 2
-; CHECK-NEXT:    vand.vx v24, v24, a1
-; CHECK-NEXT:    vand.vx v0, v0, a1
-; CHECK-NEXT:    vsll.vi v24, v24, 2
-; CHECK-NEXT:    vor.vv v24, v0, v24
-; CHECK-NEXT:    vsrl.vi v0, v24, 1
-; CHECK-NEXT:    vand.vx v24, v24, a0
-; CHECK-NEXT:    vand.vx v0, v0, a0
-; CHECK-NEXT:    vadd.vv v24, v24, v24
-; CHECK-NEXT:    vor.vv v0, v0, v24
-; CHECK-NEXT:    vand.vx v24, v0, t4
-; CHECK-NEXT:    csrr a7, vlenb
-; CHECK-NEXT:    slli a7, a7, 3
-; CHECK-NEXT:    mv t4, a7
-; CHECK-NEXT:    slli a7, a7, 1
-; CHECK-NEXT:    add t4, t4, a7
-; CHECK-NEXT:    slli a7, a7, 1
-; CHECK-NEXT:    add t4, t4, a7
-; CHECK-NEXT:    slli a7, a7, 1
-; CHECK-NEXT:    add a7, a7, t4
-; CHECK-NEXT:    add a7, sp, a7
-; CHECK-NEXT:    addi a7, a7, 16
-; CHECK-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    lui a7, 4
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 4
-; CHECK-NEXT:    vand.vx v8, v8, a2
-; CHECK-NEXT:    vand.vx v16, v16, a2
-; CHECK-NEXT:    vsll.vi v8, v8, 4
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a1
-; CHECK-NEXT:    vand.vx v16, v16, a1
-; CHECK-NEXT:    vsll.vi v8, v8, 2
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vand.vx v16, v0, a6
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 4
-; CHECK-NEXT:    mv t4, a6
-; CHECK-NEXT:    slli a6, a6, 1
-; CHECK-NEXT:    add t4, t4, a6
-; CHECK-NEXT:    slli a6, a6, 1
-; CHECK-NEXT:    add a6, a6, t4
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    lui a6, 8
-; CHECK-NEXT:    slli t5, t5, 11
-; CHECK-NEXT:    vand.vx v16, v0, a4
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    mv t4, a4
-; CHECK-NEXT:    slli a4, a4, 2
-; CHECK-NEXT:    add t4, t4, a4
-; CHECK-NEXT:    slli a4, a4, 1
-; CHECK-NEXT:    add a4, a4, t4
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, a5
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 5
-; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:    slli a4, a4, 1
-; CHECK-NEXT:    add a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, t0
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:    slli a4, a4, 1
-; CHECK-NEXT:    add a5, a5, a4
-; CHECK-NEXT:    slli a4, a4, 2
-; CHECK-NEXT:    add a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, t1
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 4
-; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:    slli a4, a4, 2
-; CHECK-NEXT:    add a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, t2
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    add a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, t5
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 6
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, t3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vi v16, v0, 2
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vi v24, v0, 1
-; CHECK-NEXT:    vand.vi v16, v0, 4
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vi v16, v0, 8
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, a7
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 5
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vand.vx v16, v0, a6
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vmul.vv v16, v8, v24
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v24, v8, v16
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v0, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 5
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 5
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 6
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 6
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 5
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 5
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v16, v8, v16
-; CHECK-NEXT:    vxor.vv v16, v16, v24
-; CHECK-NEXT:    vxor.vv v16, v16, v0
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v16, v16, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v16, v16, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v16, v16, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 5
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v16, v16, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v16, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 6
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 5
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v24, v24, v8
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vxor.vv v8, v24, v8
-; CHECK-NEXT:    vsll.vi v16, v16, 8
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v16, v8, 4
-; CHECK-NEXT:    vand.vx v8, v8, a2
-; CHECK-NEXT:    vand.vx v16, v16, a2
-; CHECK-NEXT:    vsll.vi v8, v8, 4
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a1
-; CHECK-NEXT:    vand.vx v16, v16, a1
-; CHECK-NEXT:    vsll.vi v8, v8, 2
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vor.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v8, v8, 1
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 7
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
-  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
-  %va.ext = zext <vscale x 32 x i16> %va to <vscale x 32 x i32>
-  %vb.ext = zext <vscale x 32 x i16> %vb to <vscale x 32 x i32>
-  %clmul = call <vscale x 32 x i32> @llvm.clmul.nxv32i32(<vscale x 32 x i32> %va.ext, <vscale x 32 x i32> %vb.ext)
-  %res.ext = lshr <vscale x 32 x i32> %clmul, splat(i32 16)
-  %res = trunc <vscale x 32 x i32> %res.ext to <vscale x 32 x i16>
-  ret <vscale x 32 x i16> %res
-}
-
-define <vscale x 1 x i32> @clmulh_nxv1i32_vv(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb) nounwind {
-; RV32V-LABEL: clmulh_nxv1i32_vv:
+; RV32V-LABEL: clmulh_nxv32i16_vv:
 ; RV32V:       # %bb.0:
 ; RV32V-NEXT:    addi sp, sp, -16
-; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32V-NEXT:    vzext.vf2 v10, v8
-; RV32V-NEXT:    vzext.vf2 v8, v9
-; RV32V-NEXT:    li a2, 16
-; RV32V-NEXT:    li a1, 32
-; RV32V-NEXT:    vand.vx v9, v8, a2
-; RV32V-NEXT:    li a2, 64
-; RV32V-NEXT:    vand.vi v11, v8, 2
-; RV32V-NEXT:    vand.vi v12, v8, 1
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v11, v12, v11
-; RV32V-NEXT:    vand.vi v12, v8, 4
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v11, v11, v12
-; RV32V-NEXT:    vand.vi v12, v8, 8
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v11, v11, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    li a2, 128
-; RV32V-NEXT:    vmul.vv v9, v10, v9
-; RV32V-NEXT:    vxor.vv v9, v11, v9
-; RV32V-NEXT:    vand.vx v11, v8, a1
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    li a2, 256
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v9, v9, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    li a2, 512
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    li a2, 1024
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v9, v9, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 1
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    li a2, 1
-; RV32V-NEXT:    slli a2, a2, 11
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v9, v9, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 2
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v9, v9, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 4
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 8
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v9, v9, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 16
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 32
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v9, v9, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 64
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 128
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v9, v9, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 256
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 512
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v9, v9, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 1024
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 2048
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v9, v9, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 4096
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 8192
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v9, v9, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 16384
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 32768
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v9, v9, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 65536
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 131072
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v9, v9, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    addi a2, sp, 8
-; RV32V-NEXT:    sw a0, 8(sp)
-; RV32V-NEXT:    sw zero, 12(sp)
-; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v9, v9, v11
-; RV32V-NEXT:    vlse64.v v11, (a2), zero
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v9, v9, v12
-; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    vand.vv v11, v8, v11
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    sub sp, sp, a0
+; RV32V-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV32V-NEXT:    vsrl.vi v24, v8, 8
+; RV32V-NEXT:    vsll.vi v8, v8, 8
+; RV32V-NEXT:    lui a3, 1
+; RV32V-NEXT:    lui a0, 3
+; RV32V-NEXT:    lui a6, 5
+; RV32V-NEXT:    vsrl.vi v0, v16, 8
+; RV32V-NEXT:    vsll.vi v16, v16, 8
+; RV32V-NEXT:    li t4, 16
+; RV32V-NEXT:    li a7, 32
+; RV32V-NEXT:    li a4, 64
+; RV32V-NEXT:    li a5, 128
+; RV32V-NEXT:    li t0, 256
+; RV32V-NEXT:    li t1, 512
+; RV32V-NEXT:    li t2, 1024
+; RV32V-NEXT:    li t5, 1
+; RV32V-NEXT:    lui t3, 2
+; RV32V-NEXT:    addi a2, a3, -241
+; RV32V-NEXT:    addi a1, a0, 819
+; RV32V-NEXT:    addi a0, a6, 1365
+; RV32V-NEXT:    vor.vv v16, v16, v0
+; RV32V-NEXT:    vsrl.vi v0, v16, 4
+; RV32V-NEXT:    vand.vx v16, v16, a2
+; RV32V-NEXT:    vand.vx v0, v0, a2
+; RV32V-NEXT:    vsll.vi v16, v16, 4
+; RV32V-NEXT:    vor.vv v16, v0, v16
+; RV32V-NEXT:    vsrl.vi v0, v16, 2
+; RV32V-NEXT:    vand.vx v16, v16, a1
+; RV32V-NEXT:    vand.vx v0, v0, a1
+; RV32V-NEXT:    vsll.vi v16, v16, 2
+; RV32V-NEXT:    vor.vv v16, v0, v16
+; RV32V-NEXT:    vsrl.vi v0, v16, 1
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vand.vx v0, v0, a0
+; RV32V-NEXT:    vadd.vv v16, v16, v16
+; RV32V-NEXT:    vor.vv v0, v0, v16
+; RV32V-NEXT:    vand.vx v16, v0, t4
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 3
+; RV32V-NEXT:    mv t4, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add t4, t4, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add t4, t4, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a6, a6, t4
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a6, 4
+; RV32V-NEXT:    vor.vv v8, v8, v24
+; RV32V-NEXT:    vsrl.vi v16, v8, 4
+; RV32V-NEXT:    vand.vx v8, v8, a2
+; RV32V-NEXT:    vand.vx v16, v16, a2
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 2
+; RV32V-NEXT:    vand.vx v8, v8, a1
+; RV32V-NEXT:    vand.vx v16, v16, a1
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
 ; RV32V-NEXT:    vand.vx v8, v8, a0
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    vxor.vv v8, v9, v8
-; RV32V-NEXT:    vmul.vv v9, v10, v11
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32V-NEXT:    vnsrl.wx v8, v8, a1
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vand.vx v16, v0, a7
+; RV32V-NEXT:    csrr a7, vlenb
+; RV32V-NEXT:    slli a7, a7, 4
+; RV32V-NEXT:    mv t4, a7
+; RV32V-NEXT:    slli a7, a7, 1
+; RV32V-NEXT:    add t4, t4, a7
+; RV32V-NEXT:    slli a7, a7, 1
+; RV32V-NEXT:    add a7, a7, t4
+; RV32V-NEXT:    add a7, sp, a7
+; RV32V-NEXT:    addi a7, a7, 16
+; RV32V-NEXT:    vs8r.v v16, (a7) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a7, 8
+; RV32V-NEXT:    slli t5, t5, 11
+; RV32V-NEXT:    vand.vx v16, v0, a4
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv t4, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add t4, t4, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, t4
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, a5
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t0
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t1
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t2
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t5
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 6
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, a3
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t3
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v16, v0, 2
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v24, v0, 1
+; RV32V-NEXT:    vand.vi v16, v0, 4
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v16, v0, 8
+; RV32V-NEXT:    addi a3, sp, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, a6
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 5
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, a7
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v16, v8, v24
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v24, v8, v16
+; RV32V-NEXT:    addi a3, sp, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v0, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 5
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 5
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 6
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 6
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 5
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 5
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v8, v16
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v0
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 5
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v16, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 6
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 5
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v24, v8
+; RV32V-NEXT:    vsll.vi v16, v16, 8
+; RV32V-NEXT:    vsrl.vi v8, v8, 8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 4
+; RV32V-NEXT:    vand.vx v8, v8, a2
+; RV32V-NEXT:    vand.vx v16, v16, a2
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 2
+; RV32V-NEXT:    vand.vx v8, v8, a1
+; RV32V-NEXT:    vand.vx v16, v16, a1
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v8, v8, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    add sp, sp, a0
 ; RV32V-NEXT:    addi sp, sp, 16
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmulh_nxv1i32_vv:
+; RV64V-LABEL: clmulh_nxv32i16_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64V-NEXT:    vzext.vf2 v10, v8
-; RV64V-NEXT:    vzext.vf2 v8, v9
-; RV64V-NEXT:    li a1, 16
-; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    li a2, 64
-; RV64V-NEXT:    vand.vi v9, v8, 2
-; RV64V-NEXT:    vand.vi v11, v8, 1
-; RV64V-NEXT:    vmul.vv v9, v10, v9
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v11, v9
-; RV64V-NEXT:    vand.vi v11, v8, 4
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vi v11, v8, 8
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a1
-; RV64V-NEXT:    li a1, 128
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a0
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    li a2, 256
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a1
-; RV64V-NEXT:    li a1, 512
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    li a2, 1024
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a1
-; RV64V-NEXT:    li a1, 1
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    slli a2, a1, 11
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 1
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 2
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 4
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 8
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 16
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 32
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 64
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 128
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 256
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 512
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 1024
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 2048
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 4096
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 8192
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 16384
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 32768
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 65536
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 131072
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 262144
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    slli a1, a1, 31
-; RV64V-NEXT:    vmul.vv v11, v10, v11
-; RV64V-NEXT:    vand.vx v8, v8, a1
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vmul.vv v8, v10, v8
-; RV64V-NEXT:    vxor.vv v8, v9, v8
-; RV64V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64V-NEXT:    vnsrl.wx v8, v8, a0
-; RV64V-NEXT:    ret
-;
-; RV32ZVBC64-LABEL: clmulh_nxv1i32_vv:
-; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf2 v10, v9
-; RV32ZVBC64-NEXT:    vzext.vf2 v9, v8
-; RV32ZVBC64-NEXT:    vclmul.vv v8, v9, v10
-; RV32ZVBC64-NEXT:    li a0, 32
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wx v8, v8, a0
-; RV32ZVBC64-NEXT:    ret
-;
-; RV64ZVBC64-LABEL: clmulh_nxv1i32_vv:
+; RV64V-NEXT:    addi sp, sp, -16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    sub sp, sp, a0
+; RV64V-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV64V-NEXT:    vsrl.vi v24, v8, 8
+; RV64V-NEXT:    vsll.vi v8, v8, 8
+; RV64V-NEXT:    lui a3, 1
+; RV64V-NEXT:    lui a0, 3
+; RV64V-NEXT:    lui a6, 5
+; RV64V-NEXT:    vsrl.vi v0, v16, 8
+; RV64V-NEXT:    vsll.vi v16, v16, 8
+; RV64V-NEXT:    li t4, 16
+; RV64V-NEXT:    li a7, 32
+; RV64V-NEXT:    li a4, 64
+; RV64V-NEXT:    li a5, 128
+; RV64V-NEXT:    li t0, 256
+; RV64V-NEXT:    li t1, 512
+; RV64V-NEXT:    li t2, 1024
+; RV64V-NEXT:    li t5, 1
+; RV64V-NEXT:    lui t3, 2
+; RV64V-NEXT:    addi a2, a3, -241
+; RV64V-NEXT:    addi a1, a0, 819
+; RV64V-NEXT:    addi a0, a6, 1365
+; RV64V-NEXT:    vor.vv v16, v16, v0
+; RV64V-NEXT:    vsrl.vi v0, v16, 4
+; RV64V-NEXT:    vand.vx v16, v16, a2
+; RV64V-NEXT:    vand.vx v0, v0, a2
+; RV64V-NEXT:    vsll.vi v16, v16, 4
+; RV64V-NEXT:    vor.vv v16, v0, v16
+; RV64V-NEXT:    vsrl.vi v0, v16, 2
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vand.vx v0, v0, a1
+; RV64V-NEXT:    vsll.vi v16, v16, 2
+; RV64V-NEXT:    vor.vv v16, v0, v16
+; RV64V-NEXT:    vsrl.vi v0, v16, 1
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vand.vx v0, v0, a0
+; RV64V-NEXT:    vadd.vv v16, v16, v16
+; RV64V-NEXT:    vor.vv v0, v0, v16
+; RV64V-NEXT:    vand.vx v16, v0, t4
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv t4, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add t4, t4, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add t4, t4, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, t4
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 16
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    lui a6, 4
+; RV64V-NEXT:    vor.vv v8, v8, v24
+; RV64V-NEXT:    vsrl.vi v16, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, a2
+; RV64V-NEXT:    vand.vx v16, v16, a2
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vand.vx v16, v0, a7
+; RV64V-NEXT:    csrr a7, vlenb
+; RV64V-NEXT:    slli a7, a7, 4
+; RV64V-NEXT:    mv t4, a7
+; RV64V-NEXT:    slli a7, a7, 1
+; RV64V-NEXT:    add t4, t4, a7
+; RV64V-NEXT:    slli a7, a7, 1
+; RV64V-NEXT:    add a7, a7, t4
+; RV64V-NEXT:    add a7, sp, a7
+; RV64V-NEXT:    addi a7, a7, 16
+; RV64V-NEXT:    vs8r.v v16, (a7) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    lui a7, 8
+; RV64V-NEXT:    slli t5, t5, 11
+; RV64V-NEXT:    vand.vx v16, v0, a4
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv t4, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add t4, t4, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, t4
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 16
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a5
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 16
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t0
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 16
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t1
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 16
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t2
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 16
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t5
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 6
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 16
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a3
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t3
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v16, v0, 2
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v24, v0, 1
+; RV64V-NEXT:    vand.vi v16, v0, 4
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v16, v0, 8
+; RV64V-NEXT:    addi a3, sp, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a6
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a7
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vmul.vv v16, v8, v24
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v24, v8, v16
+; RV64V-NEXT:    addi a3, sp, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v0, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 6
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 6
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v8, v16
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vxor.vv v16, v16, v0
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v16, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 6
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v24, v8
+; RV64V-NEXT:    vsll.vi v16, v16, 8
+; RV64V-NEXT:    vsrl.vi v8, v8, 8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, a2
+; RV64V-NEXT:    vand.vx v16, v16, a2
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v8, v8, 1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    add sp, sp, a0
+; RV64V-NEXT:    addi sp, sp, 16
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmulh_nxv32i16_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    addi sp, sp, -16
+; RV32ZVBC64-NEXT:    csrr a0, vlenb
+; RV32ZVBC64-NEXT:    slli a0, a0, 7
+; RV32ZVBC64-NEXT:    sub sp, sp, a0
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV32ZVBC64-NEXT:    vsrl.vi v24, v8, 8
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 8
+; RV32ZVBC64-NEXT:    lui a3, 1
+; RV32ZVBC64-NEXT:    lui a0, 3
+; RV32ZVBC64-NEXT:    lui a6, 5
+; RV32ZVBC64-NEXT:    vsrl.vi v0, v16, 8
+; RV32ZVBC64-NEXT:    vsll.vi v16, v16, 8
+; RV32ZVBC64-NEXT:    li t4, 16
+; RV32ZVBC64-NEXT:    li a7, 32
+; RV32ZVBC64-NEXT:    li a4, 64
+; RV32ZVBC64-NEXT:    li a5, 128
+; RV32ZVBC64-NEXT:    li t0, 256
+; RV32ZVBC64-NEXT:    li t1, 512
+; RV32ZVBC64-NEXT:    li t2, 1024
+; RV32ZVBC64-NEXT:    li t5, 1
+; RV32ZVBC64-NEXT:    lui t3, 2
+; RV32ZVBC64-NEXT:    addi a2, a3, -241
+; RV32ZVBC64-NEXT:    addi a1, a0, 819
+; RV32ZVBC64-NEXT:    addi a0, a6, 1365
+; RV32ZVBC64-NEXT:    vor.vv v16, v16, v0
+; RV32ZVBC64-NEXT:    vsrl.vi v0, v16, 4
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV32ZVBC64-NEXT:    vand.vx v0, v0, a2
+; RV32ZVBC64-NEXT:    vsll.vi v16, v16, 4
+; RV32ZVBC64-NEXT:    vor.vv v16, v0, v16
+; RV32ZVBC64-NEXT:    vsrl.vi v0, v16, 2
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV32ZVBC64-NEXT:    vand.vx v0, v0, a1
+; RV32ZVBC64-NEXT:    vsll.vi v16, v16, 2
+; RV32ZVBC64-NEXT:    vor.vv v16, v0, v16
+; RV32ZVBC64-NEXT:    vsrl.vi v0, v16, 1
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vand.vx v0, v0, a0
+; RV32ZVBC64-NEXT:    vadd.vv v16, v16, v16
+; RV32ZVBC64-NEXT:    vor.vv v0, v0, v16
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t4
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 3
+; RV32ZVBC64-NEXT:    mv t4, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add t4, t4, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add t4, t4, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a6, a6, t4
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    lui a6, 4
+; RV32ZVBC64-NEXT:    vor.vv v8, v8, v24
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a7
+; RV32ZVBC64-NEXT:    csrr a7, vlenb
+; RV32ZVBC64-NEXT:    slli a7, a7, 4
+; RV32ZVBC64-NEXT:    mv t4, a7
+; RV32ZVBC64-NEXT:    slli a7, a7, 1
+; RV32ZVBC64-NEXT:    add t4, t4, a7
+; RV32ZVBC64-NEXT:    slli a7, a7, 1
+; RV32ZVBC64-NEXT:    add a7, a7, t4
+; RV32ZVBC64-NEXT:    add a7, sp, a7
+; RV32ZVBC64-NEXT:    addi a7, a7, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a7) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    lui a7, 8
+; RV32ZVBC64-NEXT:    slli t5, t5, 11
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a4
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv t4, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add t4, t4, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, t4
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a5
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t0
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t1
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t2
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t5
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 6
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a3
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t3
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vi v16, v0, 2
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vi v24, v0, 1
+; RV32ZVBC64-NEXT:    vand.vi v16, v0, 4
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vi v16, v0, 8
+; RV32ZVBC64-NEXT:    addi a3, sp, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a6
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 5
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a7
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v24
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v24, v8, v16
+; RV32ZVBC64-NEXT:    addi a3, sp, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 5
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 5
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 6
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 6
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 5
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 5
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 5
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v16, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 6
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 5
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v8, v24, v8
+; RV32ZVBC64-NEXT:    vsll.vi v16, v16, 8
+; RV32ZVBC64-NEXT:    vsrl.vi v8, v8, 8
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v8, v8, 1
+; RV32ZVBC64-NEXT:    csrr a0, vlenb
+; RV32ZVBC64-NEXT:    slli a0, a0, 7
+; RV32ZVBC64-NEXT:    add sp, sp, a0
+; RV32ZVBC64-NEXT:    addi sp, sp, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv32i16_vv:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf2 v10, v9
-; RV64ZVBC64-NEXT:    vzext.vf2 v9, v8
-; RV64ZVBC64-NEXT:    vclmul.vv v8, v9, v10
-; RV64ZVBC64-NEXT:    li a0, 32
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wx v8, v8, a0
+; RV64ZVBC64-NEXT:    addi sp, sp, -16
+; RV64ZVBC64-NEXT:    csrr a0, vlenb
+; RV64ZVBC64-NEXT:    slli a0, a0, 7
+; RV64ZVBC64-NEXT:    sub sp, sp, a0
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV64ZVBC64-NEXT:    vsrl.vi v24, v8, 8
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 8
+; RV64ZVBC64-NEXT:    lui a3, 1
+; RV64ZVBC64-NEXT:    lui a0, 3
+; RV64ZVBC64-NEXT:    lui a6, 5
+; RV64ZVBC64-NEXT:    vsrl.vi v0, v16, 8
+; RV64ZVBC64-NEXT:    vsll.vi v16, v16, 8
+; RV64ZVBC64-NEXT:    li t4, 16
+; RV64ZVBC64-NEXT:    li a7, 32
+; RV64ZVBC64-NEXT:    li a4, 64
+; RV64ZVBC64-NEXT:    li a5, 128
+; RV64ZVBC64-NEXT:    li t0, 256
+; RV64ZVBC64-NEXT:    li t1, 512
+; RV64ZVBC64-NEXT:    li t2, 1024
+; RV64ZVBC64-NEXT:    li t5, 1
+; RV64ZVBC64-NEXT:    lui t3, 2
+; RV64ZVBC64-NEXT:    addi a2, a3, -241
+; RV64ZVBC64-NEXT:    addi a1, a0, 819
+; RV64ZVBC64-NEXT:    addi a0, a6, 1365
+; RV64ZVBC64-NEXT:    vor.vv v16, v16, v0
+; RV64ZVBC64-NEXT:    vsrl.vi v0, v16, 4
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC64-NEXT:    vand.vx v0, v0, a2
+; RV64ZVBC64-NEXT:    vsll.vi v16, v16, 4
+; RV64ZVBC64-NEXT:    vor.vv v16, v0, v16
+; RV64ZVBC64-NEXT:    vsrl.vi v0, v16, 2
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vand.vx v0, v0, a1
+; RV64ZVBC64-NEXT:    vsll.vi v16, v16, 2
+; RV64ZVBC64-NEXT:    vor.vv v16, v0, v16
+; RV64ZVBC64-NEXT:    vsrl.vi v0, v16, 1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vand.vx v0, v0, a0
+; RV64ZVBC64-NEXT:    vadd.vv v16, v16, v16
+; RV64ZVBC64-NEXT:    vor.vv v0, v0, v16
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t4
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv t4, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add t4, t4, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add t4, t4, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, t4
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    lui a6, 4
+; RV64ZVBC64-NEXT:    vor.vv v8, v8, v24
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a7
+; RV64ZVBC64-NEXT:    csrr a7, vlenb
+; RV64ZVBC64-NEXT:    slli a7, a7, 4
+; RV64ZVBC64-NEXT:    mv t4, a7
+; RV64ZVBC64-NEXT:    slli a7, a7, 1
+; RV64ZVBC64-NEXT:    add t4, t4, a7
+; RV64ZVBC64-NEXT:    slli a7, a7, 1
+; RV64ZVBC64-NEXT:    add a7, a7, t4
+; RV64ZVBC64-NEXT:    add a7, sp, a7
+; RV64ZVBC64-NEXT:    addi a7, a7, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a7) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    lui a7, 8
+; RV64ZVBC64-NEXT:    slli t5, t5, 11
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a4
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv t4, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add t4, t4, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, t4
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a5
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t0
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t1
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t2
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t5
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 6
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a3
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t3
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 2
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v24, v0, 1
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 4
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 8
+; RV64ZVBC64-NEXT:    addi a3, sp, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a6
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 5
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a7
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v24
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v24, v8, v16
+; RV64ZVBC64-NEXT:    addi a3, sp, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 5
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 5
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 6
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 6
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 5
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 5
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 5
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v16, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 6
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 5
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v8, v24, v8
+; RV64ZVBC64-NEXT:    vsll.vi v16, v16, 8
+; RV64ZVBC64-NEXT:    vsrl.vi v8, v8, 8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v8, v8, 1
+; RV64ZVBC64-NEXT:    csrr a0, vlenb
+; RV64ZVBC64-NEXT:    slli a0, a0, 7
+; RV64ZVBC64-NEXT:    add sp, sp, a0
+; RV64ZVBC64-NEXT:    addi sp, sp, 16
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmulh_nxv1i32_vv:
+; RV32ZVBC32-LABEL: clmulh_nxv32i16_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    addi sp, sp, -16
-; RV32ZVBC32-NEXT:    lui a0, 524288
-; RV32ZVBC32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV32ZVBC32-NEXT:    li a2, 16
-; RV32ZVBC32-NEXT:    li a1, 32
-; RV32ZVBC32-NEXT:    vand.vx v9, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 64
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v12, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v11, v12, v11
-; RV32ZVBC32-NEXT:    vand.vi v12, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v11, v11, v12
-; RV32ZVBC32-NEXT:    vand.vi v12, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v11, v11, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV32ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 1
-; RV32ZVBC32-NEXT:    slli a2, a2, 11
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 2
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 4
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 8
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 16
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 32
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 64
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 2048
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 4096
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 8192
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 16384
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 32768
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 65536
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 131072
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    addi a2, sp, 8
-; RV32ZVBC32-NEXT:    sw a0, 8(sp)
-; RV32ZVBC32-NEXT:    sw zero, 12(sp)
-; RV32ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vlse64.v v11, (a2), zero
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v9, v9, v12
-; RV32ZVBC32-NEXT:    lui a0, 262144
-; RV32ZVBC32-NEXT:    vand.vv v11, v8, v11
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV32ZVBC32-NEXT:    vmul.vv v9, v10, v11
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wx v8, v8, a1
-; RV32ZVBC32-NEXT:    addi sp, sp, 16
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v16
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmulh_nxv1i32_vv:
+; RV64ZVBC32-LABEL: clmulh_nxv32i16_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v9
-; RV64ZVBC32-NEXT:    li a1, 16
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    li a2, 64
-; RV64ZVBC32-NEXT:    vand.vi v9, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v9, v10, v9
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v11, v9
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a1
-; RV64ZVBC32-NEXT:    li a1, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    li a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a1
-; RV64ZVBC32-NEXT:    li a1, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    li a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a1
-; RV64ZVBC32-NEXT:    li a1, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    slli a2, a1, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 16
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 262144
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    slli a1, a1, 31
-; RV64ZVBC32-NEXT:    vmul.vv v11, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a1
-; RV64ZVBC32-NEXT:    vxor.vv v9, v9, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vxor.vv v8, v9, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wx v8, v8, a0
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v16
 ; RV64ZVBC32-NEXT:    ret
-  %va.ext = zext <vscale x 1 x i32> %va to <vscale x 1 x i64>
-  %vb.ext = zext <vscale x 1 x i32> %vb to <vscale x 1 x i64>
-  %clmul = call <vscale x 1 x i64> @llvm.clmul.nxv1i64(<vscale x 1 x i64> %va.ext, <vscale x 1 x i64> %vb.ext)
-  %res.ext = lshr <vscale x 1 x i64> %clmul, splat(i64 32)
-  %res = trunc <vscale x 1 x i64> %res.ext to <vscale x 1 x i32>
-  ret <vscale x 1 x i32> %res
+  %va.ext = zext <vscale x 32 x i16> %va to <vscale x 32 x i32>
+  %vb.ext = zext <vscale x 32 x i16> %vb to <vscale x 32 x i32>
+  %clmul = call <vscale x 32 x i32> @llvm.clmul.nxv32i32(<vscale x 32 x i32> %va.ext, <vscale x 32 x i32> %vb.ext)
+  %res.ext = lshr <vscale x 32 x i32> %clmul, splat(i32 16)
+  %res = trunc <vscale x 32 x i32> %res.ext to <vscale x 32 x i16>
+  ret <vscale x 32 x i16> %res
 }
 
-define <vscale x 1 x i32> @clmulh_nxv1i32_vx(<vscale x 1 x i32> %va, i32 %b) nounwind {
-; RV32V-LABEL: clmulh_nxv1i32_vx:
+define <vscale x 32 x i16> @clmulh_nxv32i16_vx(<vscale x 32 x i16> %va, i16 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv32i16_vx:
 ; RV32V:       # %bb.0:
 ; RV32V-NEXT:    addi sp, sp, -16
-; RV32V-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; RV32V-NEXT:    vmv.v.x v10, a0
-; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32V-NEXT:    vzext.vf2 v9, v8
-; RV32V-NEXT:    li a2, 16
-; RV32V-NEXT:    li a1, 32
-; RV32V-NEXT:    vzext.vf2 v8, v10
-; RV32V-NEXT:    vand.vx v10, v8, a2
-; RV32V-NEXT:    li a2, 64
-; RV32V-NEXT:    vand.vi v11, v8, 2
-; RV32V-NEXT:    vand.vi v12, v8, 1
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v11, v12, v11
-; RV32V-NEXT:    vand.vi v12, v8, 4
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v11, v11, v12
-; RV32V-NEXT:    vand.vi v12, v8, 8
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v11, v11, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    li a2, 128
-; RV32V-NEXT:    vmul.vv v10, v9, v10
-; RV32V-NEXT:    vxor.vv v10, v11, v10
-; RV32V-NEXT:    vand.vx v11, v8, a1
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    li a2, 256
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    li a2, 512
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    li a2, 1024
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    li a2, 1
-; RV32V-NEXT:    slli a2, a2, 11
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 1
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 2
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 4
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 8
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 16
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 32
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 64
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 128
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 256
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 512
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 1024
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 2048
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 4096
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 8192
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 16384
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 32768
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    lui a2, 65536
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
-; RV32V-NEXT:    lui a2, 131072
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    vand.vx v11, v8, a2
-; RV32V-NEXT:    addi a2, sp, 8
-; RV32V-NEXT:    sw a0, 8(sp)
-; RV32V-NEXT:    sw zero, 12(sp)
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v10, v10, v12
-; RV32V-NEXT:    vlse64.v v12, (a2), zero
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    vand.vv v11, v8, v12
+; RV32V-NEXT:    csrr a1, vlenb
+; RV32V-NEXT:    slli a1, a1, 7
+; RV32V-NEXT:    sub sp, sp, a1
+; RV32V-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; RV32V-NEXT:    vmv.v.x v24, a0
+; RV32V-NEXT:    vsrl.vi v16, v8, 8
+; RV32V-NEXT:    vsll.vi v8, v8, 8
+; RV32V-NEXT:    lui a3, 1
+; RV32V-NEXT:    lui a0, 3
+; RV32V-NEXT:    lui a7, 5
+; RV32V-NEXT:    li t4, 16
+; RV32V-NEXT:    li a6, 32
+; RV32V-NEXT:    li a4, 64
+; RV32V-NEXT:    li a5, 128
+; RV32V-NEXT:    li t0, 256
+; RV32V-NEXT:    li t1, 512
+; RV32V-NEXT:    li t2, 1024
+; RV32V-NEXT:    li t5, 1
+; RV32V-NEXT:    lui t3, 2
+; RV32V-NEXT:    addi a2, a3, -241
+; RV32V-NEXT:    addi a1, a0, 819
+; RV32V-NEXT:    addi a0, a7, 1365
+; RV32V-NEXT:    vsrl.vi v0, v24, 8
+; RV32V-NEXT:    vsll.vi v24, v24, 8
+; RV32V-NEXT:    vor.vv v24, v24, v0
+; RV32V-NEXT:    vsrl.vi v0, v24, 4
+; RV32V-NEXT:    vand.vx v24, v24, a2
+; RV32V-NEXT:    vand.vx v0, v0, a2
+; RV32V-NEXT:    vsll.vi v24, v24, 4
+; RV32V-NEXT:    vor.vv v24, v0, v24
+; RV32V-NEXT:    vsrl.vi v0, v24, 2
+; RV32V-NEXT:    vand.vx v24, v24, a1
+; RV32V-NEXT:    vand.vx v0, v0, a1
+; RV32V-NEXT:    vsll.vi v24, v24, 2
+; RV32V-NEXT:    vor.vv v24, v0, v24
+; RV32V-NEXT:    vsrl.vi v0, v24, 1
+; RV32V-NEXT:    vand.vx v24, v24, a0
+; RV32V-NEXT:    vand.vx v0, v0, a0
+; RV32V-NEXT:    vadd.vv v24, v24, v24
+; RV32V-NEXT:    vor.vv v0, v0, v24
+; RV32V-NEXT:    vand.vx v24, v0, t4
+; RV32V-NEXT:    csrr a7, vlenb
+; RV32V-NEXT:    slli a7, a7, 3
+; RV32V-NEXT:    mv t4, a7
+; RV32V-NEXT:    slli a7, a7, 1
+; RV32V-NEXT:    add t4, t4, a7
+; RV32V-NEXT:    slli a7, a7, 1
+; RV32V-NEXT:    add t4, t4, a7
+; RV32V-NEXT:    slli a7, a7, 1
+; RV32V-NEXT:    add a7, a7, t4
+; RV32V-NEXT:    add a7, sp, a7
+; RV32V-NEXT:    addi a7, a7, 16
+; RV32V-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a7, 4
+; RV32V-NEXT:    vor.vv v8, v8, v16
+; RV32V-NEXT:    vsrl.vi v16, v8, 4
+; RV32V-NEXT:    vand.vx v8, v8, a2
+; RV32V-NEXT:    vand.vx v16, v16, a2
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 2
+; RV32V-NEXT:    vand.vx v8, v8, a1
+; RV32V-NEXT:    vand.vx v16, v16, a1
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
 ; RV32V-NEXT:    vand.vx v8, v8, a0
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    vxor.vv v8, v10, v8
-; RV32V-NEXT:    vmul.vv v9, v9, v11
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32V-NEXT:    vnsrl.wx v8, v8, a1
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vand.vx v16, v0, a6
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 4
+; RV32V-NEXT:    mv t4, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add t4, t4, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a6, a6, t4
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a6, 8
+; RV32V-NEXT:    slli t5, t5, 11
+; RV32V-NEXT:    vand.vx v16, v0, a4
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv t4, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add t4, t4, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, t4
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, a5
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t0
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t1
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t2
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t5
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 6
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, a3
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t3
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v16, v0, 2
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v24, v0, 1
+; RV32V-NEXT:    vand.vi v16, v0, 4
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v16, v0, 8
+; RV32V-NEXT:    addi a3, sp, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, a7
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 5
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, a6
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v16, v8, v24
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v24, v8, v16
+; RV32V-NEXT:    addi a3, sp, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v0, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 5
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 5
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 6
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 6
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 5
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 5
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v8, v16
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v8, v16
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v0
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 5
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v16, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 2
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 6
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a4, a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 4
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 5
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    mv a4, a3
+; RV32V-NEXT:    slli a3, a3, 1
+; RV32V-NEXT:    add a3, a3, a4
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v24, v8
+; RV32V-NEXT:    vsll.vi v16, v16, 8
+; RV32V-NEXT:    vsrl.vi v8, v8, 8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 4
+; RV32V-NEXT:    vand.vx v8, v8, a2
+; RV32V-NEXT:    vand.vx v16, v16, a2
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 2
+; RV32V-NEXT:    vand.vx v8, v8, a1
+; RV32V-NEXT:    vand.vx v16, v16, a1
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v8, v8, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    add sp, sp, a0
 ; RV32V-NEXT:    addi sp, sp, 16
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmulh_nxv1i32_vx:
+; RV64V-LABEL: clmulh_nxv32i16_vx:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; RV64V-NEXT:    vmv.v.x v10, a0
-; RV64V-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV64V-NEXT:    vzext.vf2 v9, v8
-; RV64V-NEXT:    li a1, 16
-; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    li a2, 64
-; RV64V-NEXT:    vzext.vf2 v8, v10
-; RV64V-NEXT:    vand.vi v10, v8, 2
-; RV64V-NEXT:    vand.vi v11, v8, 1
-; RV64V-NEXT:    vmul.vv v10, v9, v10
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v11, v10
-; RV64V-NEXT:    vand.vi v11, v8, 4
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vi v11, v8, 8
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a1
-; RV64V-NEXT:    li a1, 128
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a0
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    li a2, 256
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a1
-; RV64V-NEXT:    li a1, 512
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    li a2, 1024
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a1
-; RV64V-NEXT:    li a1, 1
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    slli a2, a1, 11
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 1
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 2
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 4
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 8
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 16
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 32
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 64
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 128
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 256
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 512
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 1024
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 2048
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 4096
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 8192
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 16384
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 32768
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 65536
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 131072
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
-; RV64V-NEXT:    lui a2, 262144
-; RV64V-NEXT:    slli a1, a1, 31
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vand.vx v11, v8, a2
+; RV64V-NEXT:    addi sp, sp, -16
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 7
+; RV64V-NEXT:    sub sp, sp, a1
+; RV64V-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; RV64V-NEXT:    vmv.v.x v24, a0
+; RV64V-NEXT:    vsrl.vi v16, v8, 8
+; RV64V-NEXT:    vsll.vi v8, v8, 8
+; RV64V-NEXT:    lui a3, 1
+; RV64V-NEXT:    lui a0, 3
+; RV64V-NEXT:    lui a7, 5
+; RV64V-NEXT:    li t4, 16
+; RV64V-NEXT:    li a6, 32
+; RV64V-NEXT:    li a4, 64
+; RV64V-NEXT:    li a5, 128
+; RV64V-NEXT:    li t0, 256
+; RV64V-NEXT:    li t1, 512
+; RV64V-NEXT:    li t2, 1024
+; RV64V-NEXT:    li t5, 1
+; RV64V-NEXT:    lui t3, 2
+; RV64V-NEXT:    addi a2, a3, -241
+; RV64V-NEXT:    addi a1, a0, 819
+; RV64V-NEXT:    addi a0, a7, 1365
+; RV64V-NEXT:    vsrl.vi v0, v24, 8
+; RV64V-NEXT:    vsll.vi v24, v24, 8
+; RV64V-NEXT:    vor.vv v24, v24, v0
+; RV64V-NEXT:    vsrl.vi v0, v24, 4
+; RV64V-NEXT:    vand.vx v24, v24, a2
+; RV64V-NEXT:    vand.vx v0, v0, a2
+; RV64V-NEXT:    vsll.vi v24, v24, 4
+; RV64V-NEXT:    vor.vv v24, v0, v24
+; RV64V-NEXT:    vsrl.vi v0, v24, 2
+; RV64V-NEXT:    vand.vx v24, v24, a1
+; RV64V-NEXT:    vand.vx v0, v0, a1
+; RV64V-NEXT:    vsll.vi v24, v24, 2
+; RV64V-NEXT:    vor.vv v24, v0, v24
+; RV64V-NEXT:    vsrl.vi v0, v24, 1
+; RV64V-NEXT:    vand.vx v24, v24, a0
+; RV64V-NEXT:    vand.vx v0, v0, a0
+; RV64V-NEXT:    vadd.vv v24, v24, v24
+; RV64V-NEXT:    vor.vv v0, v0, v24
+; RV64V-NEXT:    vand.vx v24, v0, t4
+; RV64V-NEXT:    csrr a7, vlenb
+; RV64V-NEXT:    slli a7, a7, 3
+; RV64V-NEXT:    mv t4, a7
+; RV64V-NEXT:    slli a7, a7, 1
+; RV64V-NEXT:    add t4, t4, a7
+; RV64V-NEXT:    slli a7, a7, 1
+; RV64V-NEXT:    add t4, t4, a7
+; RV64V-NEXT:    slli a7, a7, 1
+; RV64V-NEXT:    add a7, a7, t4
+; RV64V-NEXT:    add a7, sp, a7
+; RV64V-NEXT:    addi a7, a7, 16
+; RV64V-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    lui a7, 4
+; RV64V-NEXT:    vor.vv v8, v8, v16
+; RV64V-NEXT:    vsrl.vi v16, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, a2
+; RV64V-NEXT:    vand.vx v16, v16, a2
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
 ; RV64V-NEXT:    vand.vx v8, v8, a1
-; RV64V-NEXT:    vmul.vv v11, v9, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vmul.vv v8, v9, v8
-; RV64V-NEXT:    vxor.vv v8, v10, v8
-; RV64V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64V-NEXT:    vnsrl.wx v8, v8, a0
-; RV64V-NEXT:    ret
-;
-; RV32ZVBC64-LABEL: clmulh_nxv1i32_vx:
-; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; RV32ZVBC64-NEXT:    vmv.v.x v9, a0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC64-NEXT:    vzext.vf2 v8, v9
-; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
-; RV32ZVBC64-NEXT:    li a0, 32
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wx v8, v8, a0
-; RV32ZVBC64-NEXT:    ret
-;
-; RV64ZVBC64-LABEL: clmulh_nxv1i32_vx:
-; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; RV64ZVBC64-NEXT:    vmv.v.x v9, a0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC64-NEXT:    vzext.vf2 v8, v9
-; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
-; RV64ZVBC64-NEXT:    li a0, 32
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wx v8, v8, a0
-; RV64ZVBC64-NEXT:    ret
-;
-; RV32ZVBC32-LABEL: clmulh_nxv1i32_vx:
-; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    addi sp, sp, -16
-; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v10, a0
-; RV32ZVBC32-NEXT:    lui a0, 524288
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v9, v8
-; RV32ZVBC32-NEXT:    li a2, 16
-; RV32ZVBC32-NEXT:    li a1, 32
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v10
-; RV32ZVBC32-NEXT:    vand.vx v10, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 64
-; RV32ZVBC32-NEXT:    vand.vi v11, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v12, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
-; RV32ZVBC32-NEXT:    vxor.vv v11, v12, v11
-; RV32ZVBC32-NEXT:    vand.vi v12, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
-; RV32ZVBC32-NEXT:    vxor.vv v11, v11, v12
-; RV32ZVBC32-NEXT:    vand.vi v12, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
-; RV32ZVBC32-NEXT:    vxor.vv v11, v11, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v10, v9, v10
-; RV32ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a1
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 1
-; RV32ZVBC32-NEXT:    slli a2, a2, 11
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 1
-; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 2
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 4
-; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 8
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 16
-; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 32
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 64
-; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 2048
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 4096
-; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 8192
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 16384
-; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 32768
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 65536
-; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 131072
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV32ZVBC32-NEXT:    addi a2, sp, 8
-; RV32ZVBC32-NEXT:    sw a0, 8(sp)
-; RV32ZVBC32-NEXT:    sw zero, 12(sp)
-; RV32ZVBC32-NEXT:    vmul.vv v12, v9, v12
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v12
-; RV32ZVBC32-NEXT:    vlse64.v v12, (a2), zero
-; RV32ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV32ZVBC32-NEXT:    lui a0, 262144
-; RV32ZVBC32-NEXT:    vand.vv v11, v8, v12
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v8, v9, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vmul.vv v9, v9, v11
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v9
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wx v8, v8, a1
-; RV32ZVBC32-NEXT:    addi sp, sp, 16
-; RV32ZVBC32-NEXT:    ret
-;
-; RV64ZVBC32-LABEL: clmulh_nxv1i32_vx:
-; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v10, a0
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v9, v8
-; RV64ZVBC32-NEXT:    li a1, 16
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    li a2, 64
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v10
-; RV64ZVBC32-NEXT:    vand.vi v10, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v10, v9, v10
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v11, v10
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vi v11, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a1
-; RV64ZVBC32-NEXT:    li a1, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    li a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a1
-; RV64ZVBC32-NEXT:    li a1, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    li a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a1
-; RV64ZVBC32-NEXT:    li a1, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    slli a2, a1, 11
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 2
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 4
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 8
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 16
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 32
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 64
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 128
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 512
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 262144
-; RV64ZVBC32-NEXT:    slli a1, a1, 31
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vand.vx v11, v8, a2
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a1
-; RV64ZVBC32-NEXT:    vmul.vv v11, v9, v11
-; RV64ZVBC32-NEXT:    vxor.vv v10, v10, v11
-; RV64ZVBC32-NEXT:    vmul.vv v8, v9, v8
-; RV64ZVBC32-NEXT:    vxor.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wx v8, v8, a0
-; RV64ZVBC32-NEXT:    ret
-  %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i64 0
-  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
-  %va.ext = zext <vscale x 1 x i32> %va to <vscale x 1 x i64>
-  %vb.ext = zext <vscale x 1 x i32> %vb to <vscale x 1 x i64>
-  %clmul = call <vscale x 1 x i64> @llvm.clmul.nxv1i64(<vscale x 1 x i64> %va.ext, <vscale x 1 x i64> %vb.ext)
-  %res.ext = lshr <vscale x 1 x i64> %clmul, splat(i64 32)
-  %res = trunc <vscale x 1 x i64> %res.ext to <vscale x 1 x i32>
-  ret <vscale x 1 x i32> %res
-}
-
-define <vscale x 2 x i32> @clmulh_nxv2i32_vv(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb) nounwind {
-; RV32V-LABEL: clmulh_nxv2i32_vv:
-; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -16
-; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32V-NEXT:    vzext.vf2 v10, v8
-; RV32V-NEXT:    vzext.vf2 v12, v9
-; RV32V-NEXT:    li a2, 16
-; RV32V-NEXT:    li a1, 32
-; RV32V-NEXT:    vand.vx v8, v12, a2
-; RV32V-NEXT:    li a2, 64
-; RV32V-NEXT:    vand.vi v14, v12, 2
-; RV32V-NEXT:    vand.vi v16, v12, 1
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v14, v16, v14
-; RV32V-NEXT:    vand.vi v16, v12, 4
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v14, v14, v16
-; RV32V-NEXT:    vand.vi v16, v12, 8
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v14, v14, v16
-; RV32V-NEXT:    vand.vx v16, v12, a2
-; RV32V-NEXT:    li a2, 128
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    vxor.vv v8, v14, v8
-; RV32V-NEXT:    vand.vx v14, v12, a1
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vand.vx v14, v12, a2
-; RV32V-NEXT:    li a2, 256
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vand.vx v16, v12, a2
-; RV32V-NEXT:    li a2, 512
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vand.vx v14, v12, a2
-; RV32V-NEXT:    li a2, 1024
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vand.vx v16, v12, a2
-; RV32V-NEXT:    lui a2, 1
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vand.vx v14, v12, a2
-; RV32V-NEXT:    li a2, 1
-; RV32V-NEXT:    slli a2, a2, 11
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vand.vx v16, v12, a2
-; RV32V-NEXT:    lui a2, 2
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vand.vx v16, v12, a2
-; RV32V-NEXT:    lui a2, 4
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vand.vx v14, v12, a2
-; RV32V-NEXT:    lui a2, 8
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vand.vx v16, v12, a2
-; RV32V-NEXT:    lui a2, 16
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vand.vx v14, v12, a2
-; RV32V-NEXT:    lui a2, 32
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vand.vx v16, v12, a2
-; RV32V-NEXT:    lui a2, 64
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vand.vx v14, v12, a2
-; RV32V-NEXT:    lui a2, 128
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vand.vx v16, v12, a2
-; RV32V-NEXT:    lui a2, 256
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vand.vx v14, v12, a2
-; RV32V-NEXT:    lui a2, 512
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vand.vx v16, v12, a2
-; RV32V-NEXT:    lui a2, 1024
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vand.vx v14, v12, a2
-; RV32V-NEXT:    lui a2, 2048
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vand.vx v16, v12, a2
-; RV32V-NEXT:    lui a2, 4096
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vand.vx v14, v12, a2
-; RV32V-NEXT:    lui a2, 8192
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vand.vx v16, v12, a2
-; RV32V-NEXT:    lui a2, 16384
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vand.vx v14, v12, a2
-; RV32V-NEXT:    lui a2, 32768
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vand.vx v16, v12, a2
-; RV32V-NEXT:    lui a2, 65536
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vand.vx v14, v12, a2
-; RV32V-NEXT:    lui a2, 131072
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vand.vx v16, v12, a2
-; RV32V-NEXT:    addi a2, sp, 8
-; RV32V-NEXT:    sw a0, 8(sp)
-; RV32V-NEXT:    sw zero, 12(sp)
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vlse64.v v14, (a2), zero
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    vand.vv v14, v12, v14
-; RV32V-NEXT:    vand.vx v12, v12, a0
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v10, v10, v14
-; RV32V-NEXT:    vxor.vv v10, v8, v10
-; RV32V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32V-NEXT:    vnsrl.wx v8, v10, a1
-; RV32V-NEXT:    addi sp, sp, 16
-; RV32V-NEXT:    ret
-;
-; RV64V-LABEL: clmulh_nxv2i32_vv:
-; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64V-NEXT:    vzext.vf2 v10, v8
-; RV64V-NEXT:    vzext.vf2 v12, v9
-; RV64V-NEXT:    li a1, 16
-; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    li a2, 64
-; RV64V-NEXT:    vand.vi v8, v12, 2
-; RV64V-NEXT:    vand.vi v14, v12, 1
-; RV64V-NEXT:    vmul.vv v8, v10, v8
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v14, v8
-; RV64V-NEXT:    vand.vi v14, v12, 4
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vi v14, v12, 8
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a1
-; RV64V-NEXT:    li a1, 128
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a0
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    li a2, 256
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a1
-; RV64V-NEXT:    li a1, 512
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    li a2, 1024
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a1
-; RV64V-NEXT:    li a1, 1
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    slli a2, a1, 11
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 1
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 2
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 4
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 8
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 16
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 32
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 64
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 128
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 256
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 512
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 1024
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 2048
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 4096
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 8192
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 16384
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 32768
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 65536
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 131072
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    lui a2, 262144
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vand.vx v14, v12, a2
-; RV64V-NEXT:    slli a1, a1, 31
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vand.vx v12, v12, a1
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vmul.vv v10, v10, v12
-; RV64V-NEXT:    vxor.vv v10, v8, v10
-; RV64V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64V-NEXT:    vnsrl.wx v8, v10, a0
-; RV64V-NEXT:    ret
-;
-; RV32ZVBC64-LABEL: clmulh_nxv2i32_vv:
-; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf2 v10, v9
-; RV32ZVBC64-NEXT:    vzext.vf2 v12, v8
-; RV32ZVBC64-NEXT:    vclmul.vv v10, v12, v10
-; RV32ZVBC64-NEXT:    li a0, 32
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wx v8, v10, a0
-; RV32ZVBC64-NEXT:    ret
-;
-; RV64ZVBC64-LABEL: clmulh_nxv2i32_vv:
-; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf2 v10, v9
-; RV64ZVBC64-NEXT:    vzext.vf2 v12, v8
-; RV64ZVBC64-NEXT:    vclmul.vv v10, v12, v10
-; RV64ZVBC64-NEXT:    li a0, 32
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wx v8, v10, a0
-; RV64ZVBC64-NEXT:    ret
-;
-; RV32ZVBC32-LABEL: clmulh_nxv2i32_vv:
-; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    addi sp, sp, -16
-; RV32ZVBC32-NEXT:    lui a0, 524288
-; RV32ZVBC32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC32-NEXT:    vzext.vf2 v12, v9
-; RV32ZVBC32-NEXT:    li a2, 16
-; RV32ZVBC32-NEXT:    li a1, 32
-; RV32ZVBC32-NEXT:    vand.vx v8, v12, a2
-; RV32ZVBC32-NEXT:    li a2, 64
-; RV32ZVBC32-NEXT:    vand.vi v14, v12, 2
-; RV32ZVBC32-NEXT:    vand.vi v16, v12, 1
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v14, v16, v14
-; RV32ZVBC32-NEXT:    vand.vi v16, v12, 4
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
-; RV32ZVBC32-NEXT:    vand.vi v16, v12, 8
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
-; RV32ZVBC32-NEXT:    li a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v14, v8
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a1
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV32ZVBC32-NEXT:    li a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
-; RV32ZVBC32-NEXT:    li a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV32ZVBC32-NEXT:    li a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 1
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV32ZVBC32-NEXT:    li a2, 1
-; RV32ZVBC32-NEXT:    slli a2, a2, 11
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 2
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 4
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 8
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 16
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 32
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 64
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 2048
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 4096
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 8192
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 16384
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 32768
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 65536
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV32ZVBC32-NEXT:    lui a2, 131072
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v12, a2
-; RV32ZVBC32-NEXT:    addi a2, sp, 8
-; RV32ZVBC32-NEXT:    sw a0, 8(sp)
-; RV32ZVBC32-NEXT:    sw zero, 12(sp)
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV32ZVBC32-NEXT:    vlse64.v v14, (a2), zero
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
-; RV32ZVBC32-NEXT:    lui a0, 262144
-; RV32ZVBC32-NEXT:    vand.vv v14, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v12, v12, a0
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v12
-; RV32ZVBC32-NEXT:    vmul.vv v10, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wx v8, v10, a1
-; RV32ZVBC32-NEXT:    addi sp, sp, 16
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vand.vx v16, v0, a6
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 4
+; RV64V-NEXT:    mv t4, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add t4, t4, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, t4
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 16
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    lui a6, 8
+; RV64V-NEXT:    slli t5, t5, 11
+; RV64V-NEXT:    vand.vx v16, v0, a4
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv t4, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add t4, t4, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, t4
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 16
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a5
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 16
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t0
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 16
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t1
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 16
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t2
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 16
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t5
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 6
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 16
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a3
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t3
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v16, v0, 2
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v24, v0, 1
+; RV64V-NEXT:    vand.vi v16, v0, 4
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v16, v0, 8
+; RV64V-NEXT:    addi a3, sp, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a7
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a6
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vmul.vv v16, v8, v24
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v24, v8, v16
+; RV64V-NEXT:    addi a3, sp, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v0, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 6
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 6
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v8, v16
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vxor.vv v16, v16, v0
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v16, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 6
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 16
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v24, v8
+; RV64V-NEXT:    vsll.vi v16, v16, 8
+; RV64V-NEXT:    vsrl.vi v8, v8, 8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, a2
+; RV64V-NEXT:    vand.vx v16, v16, a2
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v8, v8, 1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    add sp, sp, a0
+; RV64V-NEXT:    addi sp, sp, 16
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmulh_nxv32i16_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    addi sp, sp, -16
+; RV32ZVBC64-NEXT:    csrr a1, vlenb
+; RV32ZVBC64-NEXT:    slli a1, a1, 7
+; RV32ZVBC64-NEXT:    sub sp, sp, a1
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 8
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 8
+; RV32ZVBC64-NEXT:    lui a3, 1
+; RV32ZVBC64-NEXT:    lui a0, 3
+; RV32ZVBC64-NEXT:    lui a7, 5
+; RV32ZVBC64-NEXT:    li t4, 16
+; RV32ZVBC64-NEXT:    li a6, 32
+; RV32ZVBC64-NEXT:    li a4, 64
+; RV32ZVBC64-NEXT:    li a5, 128
+; RV32ZVBC64-NEXT:    li t0, 256
+; RV32ZVBC64-NEXT:    li t1, 512
+; RV32ZVBC64-NEXT:    li t2, 1024
+; RV32ZVBC64-NEXT:    li t5, 1
+; RV32ZVBC64-NEXT:    lui t3, 2
+; RV32ZVBC64-NEXT:    addi a2, a3, -241
+; RV32ZVBC64-NEXT:    addi a1, a0, 819
+; RV32ZVBC64-NEXT:    addi a0, a7, 1365
+; RV32ZVBC64-NEXT:    vsrl.vi v0, v24, 8
+; RV32ZVBC64-NEXT:    vsll.vi v24, v24, 8
+; RV32ZVBC64-NEXT:    vor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vsrl.vi v0, v24, 4
+; RV32ZVBC64-NEXT:    vand.vx v24, v24, a2
+; RV32ZVBC64-NEXT:    vand.vx v0, v0, a2
+; RV32ZVBC64-NEXT:    vsll.vi v24, v24, 4
+; RV32ZVBC64-NEXT:    vor.vv v24, v0, v24
+; RV32ZVBC64-NEXT:    vsrl.vi v0, v24, 2
+; RV32ZVBC64-NEXT:    vand.vx v24, v24, a1
+; RV32ZVBC64-NEXT:    vand.vx v0, v0, a1
+; RV32ZVBC64-NEXT:    vsll.vi v24, v24, 2
+; RV32ZVBC64-NEXT:    vor.vv v24, v0, v24
+; RV32ZVBC64-NEXT:    vsrl.vi v0, v24, 1
+; RV32ZVBC64-NEXT:    vand.vx v24, v24, a0
+; RV32ZVBC64-NEXT:    vand.vx v0, v0, a0
+; RV32ZVBC64-NEXT:    vadd.vv v24, v24, v24
+; RV32ZVBC64-NEXT:    vor.vv v0, v0, v24
+; RV32ZVBC64-NEXT:    vand.vx v24, v0, t4
+; RV32ZVBC64-NEXT:    csrr a7, vlenb
+; RV32ZVBC64-NEXT:    slli a7, a7, 3
+; RV32ZVBC64-NEXT:    mv t4, a7
+; RV32ZVBC64-NEXT:    slli a7, a7, 1
+; RV32ZVBC64-NEXT:    add t4, t4, a7
+; RV32ZVBC64-NEXT:    slli a7, a7, 1
+; RV32ZVBC64-NEXT:    add t4, t4, a7
+; RV32ZVBC64-NEXT:    slli a7, a7, 1
+; RV32ZVBC64-NEXT:    add a7, a7, t4
+; RV32ZVBC64-NEXT:    add a7, sp, a7
+; RV32ZVBC64-NEXT:    addi a7, a7, 16
+; RV32ZVBC64-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    lui a7, 4
+; RV32ZVBC64-NEXT:    vor.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a6
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 4
+; RV32ZVBC64-NEXT:    mv t4, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add t4, t4, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a6, a6, t4
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    lui a6, 8
+; RV32ZVBC64-NEXT:    slli t5, t5, 11
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a4
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv t4, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add t4, t4, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, t4
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a5
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t0
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t1
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t2
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t5
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 6
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a3
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t3
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vi v16, v0, 2
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vi v24, v0, 1
+; RV32ZVBC64-NEXT:    vand.vi v16, v0, 4
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vi v16, v0, 8
+; RV32ZVBC64-NEXT:    addi a3, sp, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a7
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 5
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a6
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v24
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v24, v8, v16
+; RV32ZVBC64-NEXT:    addi a3, sp, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 5
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 5
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 6
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 6
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 5
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 5
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 5
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v16, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 2
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 6
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 4
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 5
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a3, vlenb
+; RV32ZVBC64-NEXT:    slli a3, a3, 3
+; RV32ZVBC64-NEXT:    mv a4, a3
+; RV32ZVBC64-NEXT:    slli a3, a3, 1
+; RV32ZVBC64-NEXT:    add a3, a3, a4
+; RV32ZVBC64-NEXT:    add a3, sp, a3
+; RV32ZVBC64-NEXT:    addi a3, a3, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v8, v24, v8
+; RV32ZVBC64-NEXT:    vsll.vi v16, v16, 8
+; RV32ZVBC64-NEXT:    vsrl.vi v8, v8, 8
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v8, v8, 1
+; RV32ZVBC64-NEXT:    csrr a0, vlenb
+; RV32ZVBC64-NEXT:    slli a0, a0, 7
+; RV32ZVBC64-NEXT:    add sp, sp, a0
+; RV32ZVBC64-NEXT:    addi sp, sp, 16
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv32i16_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    addi sp, sp, -16
+; RV64ZVBC64-NEXT:    csrr a1, vlenb
+; RV64ZVBC64-NEXT:    slli a1, a1, 7
+; RV64ZVBC64-NEXT:    sub sp, sp, a1
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 8
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 8
+; RV64ZVBC64-NEXT:    lui a3, 1
+; RV64ZVBC64-NEXT:    lui a0, 3
+; RV64ZVBC64-NEXT:    lui a7, 5
+; RV64ZVBC64-NEXT:    li t4, 16
+; RV64ZVBC64-NEXT:    li a6, 32
+; RV64ZVBC64-NEXT:    li a4, 64
+; RV64ZVBC64-NEXT:    li a5, 128
+; RV64ZVBC64-NEXT:    li t0, 256
+; RV64ZVBC64-NEXT:    li t1, 512
+; RV64ZVBC64-NEXT:    li t2, 1024
+; RV64ZVBC64-NEXT:    li t5, 1
+; RV64ZVBC64-NEXT:    lui t3, 2
+; RV64ZVBC64-NEXT:    addi a2, a3, -241
+; RV64ZVBC64-NEXT:    addi a1, a0, 819
+; RV64ZVBC64-NEXT:    addi a0, a7, 1365
+; RV64ZVBC64-NEXT:    vsrl.vi v0, v24, 8
+; RV64ZVBC64-NEXT:    vsll.vi v24, v24, 8
+; RV64ZVBC64-NEXT:    vor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    vsrl.vi v0, v24, 4
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a2
+; RV64ZVBC64-NEXT:    vand.vx v0, v0, a2
+; RV64ZVBC64-NEXT:    vsll.vi v24, v24, 4
+; RV64ZVBC64-NEXT:    vor.vv v24, v0, v24
+; RV64ZVBC64-NEXT:    vsrl.vi v0, v24, 2
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a1
+; RV64ZVBC64-NEXT:    vand.vx v0, v0, a1
+; RV64ZVBC64-NEXT:    vsll.vi v24, v24, 2
+; RV64ZVBC64-NEXT:    vor.vv v24, v0, v24
+; RV64ZVBC64-NEXT:    vsrl.vi v0, v24, 1
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a0
+; RV64ZVBC64-NEXT:    vand.vx v0, v0, a0
+; RV64ZVBC64-NEXT:    vadd.vv v24, v24, v24
+; RV64ZVBC64-NEXT:    vor.vv v0, v0, v24
+; RV64ZVBC64-NEXT:    vand.vx v24, v0, t4
+; RV64ZVBC64-NEXT:    csrr a7, vlenb
+; RV64ZVBC64-NEXT:    slli a7, a7, 3
+; RV64ZVBC64-NEXT:    mv t4, a7
+; RV64ZVBC64-NEXT:    slli a7, a7, 1
+; RV64ZVBC64-NEXT:    add t4, t4, a7
+; RV64ZVBC64-NEXT:    slli a7, a7, 1
+; RV64ZVBC64-NEXT:    add t4, t4, a7
+; RV64ZVBC64-NEXT:    slli a7, a7, 1
+; RV64ZVBC64-NEXT:    add a7, a7, t4
+; RV64ZVBC64-NEXT:    add a7, sp, a7
+; RV64ZVBC64-NEXT:    addi a7, a7, 16
+; RV64ZVBC64-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    lui a7, 4
+; RV64ZVBC64-NEXT:    vor.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a6
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 4
+; RV64ZVBC64-NEXT:    mv t4, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add t4, t4, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, t4
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    lui a6, 8
+; RV64ZVBC64-NEXT:    slli t5, t5, 11
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a4
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv t4, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add t4, t4, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, t4
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a5
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t0
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t1
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t2
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t5
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 6
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a3
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t3
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 2
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v24, v0, 1
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 4
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 8
+; RV64ZVBC64-NEXT:    addi a3, sp, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a7
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 5
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a6
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v24
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v24, v8, v16
+; RV64ZVBC64-NEXT:    addi a3, sp, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 5
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 5
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 6
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 6
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 5
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 5
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 5
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v16, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 2
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 6
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 4
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 5
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a3, vlenb
+; RV64ZVBC64-NEXT:    slli a3, a3, 3
+; RV64ZVBC64-NEXT:    mv a4, a3
+; RV64ZVBC64-NEXT:    slli a3, a3, 1
+; RV64ZVBC64-NEXT:    add a3, a3, a4
+; RV64ZVBC64-NEXT:    add a3, sp, a3
+; RV64ZVBC64-NEXT:    addi a3, a3, 16
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v8, v24, v8
+; RV64ZVBC64-NEXT:    vsll.vi v16, v16, 8
+; RV64ZVBC64-NEXT:    vsrl.vi v8, v8, 8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v8, v8, 1
+; RV64ZVBC64-NEXT:    csrr a0, vlenb
+; RV64ZVBC64-NEXT:    slli a0, a0, 7
+; RV64ZVBC64-NEXT:    add sp, sp, a0
+; RV64ZVBC64-NEXT:    addi sp, sp, 16
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv32i16_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmulh_nxv2i32_vv:
+; RV64ZVBC32-LABEL: clmulh_nxv32i16_vx:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC32-NEXT:    vzext.vf2 v12, v9
-; RV64ZVBC32-NEXT:    li a1, 16
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    li a2, 64
-; RV64ZVBC32-NEXT:    vand.vi v8, v12, 2
-; RV64ZVBC32-NEXT:    vand.vi v14, v12, 1
-; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v14, v8
-; RV64ZVBC32-NEXT:    vand.vi v14, v12, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vi v14, v12, 8
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a1
-; RV64ZVBC32-NEXT:    li a1, 128
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a0
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    li a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a1
-; RV64ZVBC32-NEXT:    li a1, 512
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    li a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a1
-; RV64ZVBC32-NEXT:    li a1, 1
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    slli a2, a1, 11
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 1
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 2
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 8
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 16
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 32
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 64
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 128
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 512
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    lui a2, 262144
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v12, a2
-; RV64ZVBC32-NEXT:    slli a1, a1, 31
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vand.vx v12, v12, a1
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v14
-; RV64ZVBC32-NEXT:    vmul.vv v10, v10, v12
-; RV64ZVBC32-NEXT:    vxor.vv v10, v8, v10
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wx v8, v10, a0
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
-  %va.ext = zext <vscale x 2 x i32> %va to <vscale x 2 x i64>
-  %vb.ext = zext <vscale x 2 x i32> %vb to <vscale x 2 x i64>
-  %clmul = call <vscale x 2 x i64> @llvm.clmul.nxv2i64(<vscale x 2 x i64> %va.ext, <vscale x 2 x i64> %vb.ext)
-  %res.ext = lshr <vscale x 2 x i64> %clmul, splat(i64 32)
-  %res = trunc <vscale x 2 x i64> %res.ext to <vscale x 2 x i32>
-  ret <vscale x 2 x i32> %res
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %va.ext = zext <vscale x 32 x i16> %va to <vscale x 32 x i32>
+  %vb.ext = zext <vscale x 32 x i16> %vb to <vscale x 32 x i32>
+  %clmul = call <vscale x 32 x i32> @llvm.clmul.nxv32i32(<vscale x 32 x i32> %va.ext, <vscale x 32 x i32> %vb.ext)
+  %res.ext = lshr <vscale x 32 x i32> %clmul, splat(i32 16)
+  %res = trunc <vscale x 32 x i32> %res.ext to <vscale x 32 x i16>
+  ret <vscale x 32 x i16> %res
 }
 
-define <vscale x 2 x i32> @clmulh_nxv2i32_vx(<vscale x 2 x i32> %va, i32 %b) nounwind {
-; RV32V-LABEL: clmulh_nxv2i32_vx:
+define <vscale x 1 x i32> @clmulh_nxv1i32_vv(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb) nounwind {
+; RV32V-LABEL: clmulh_nxv1i32_vv:
 ; RV32V:       # %bb.0:
 ; RV32V-NEXT:    addi sp, sp, -16
-; RV32V-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32V-NEXT:    vmv.v.x v12, a0
 ; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32V-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32V-NEXT:    vzext.vf2 v10, v8
+; RV32V-NEXT:    vzext.vf2 v8, v9
 ; RV32V-NEXT:    li a2, 16
 ; RV32V-NEXT:    li a1, 32
-; RV32V-NEXT:    vzext.vf2 v8, v12
-; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    vand.vx v9, v8, a2
 ; RV32V-NEXT:    li a2, 64
-; RV32V-NEXT:    vand.vi v14, v8, 2
-; RV32V-NEXT:    vand.vi v16, v8, 1
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v14, v16, v14
-; RV32V-NEXT:    vand.vi v16, v8, 4
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v14, v14, v16
-; RV32V-NEXT:    vand.vi v16, v8, 8
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v14, v14, v16
-; RV32V-NEXT:    vand.vx v16, v8, a2
-; RV32V-NEXT:    li a2, 128
+; RV32V-NEXT:    vand.vi v11, v8, 2
+; RV32V-NEXT:    vand.vi v12, v8, 1
+; RV32V-NEXT:    vmul.vv v11, v10, v11
 ; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v12, v14, v12
-; RV32V-NEXT:    vand.vx v14, v8, a1
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    vxor.vv v11, v12, v11
+; RV32V-NEXT:    vand.vi v12, v8, 4
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v11, v11, v12
+; RV32V-NEXT:    vand.vi v12, v8, 8
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v11, v11, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    li a2, 128
+; RV32V-NEXT:    vmul.vv v9, v10, v9
+; RV32V-NEXT:    vxor.vv v9, v11, v9
+; RV32V-NEXT:    vand.vx v11, v8, a1
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    li a2, 256
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    li a2, 512
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    li a2, 1024
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 1
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    li a2, 1
 ; RV32V-NEXT:    slli a2, a2, 11
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v8, a2
-; RV32V-NEXT:    lui a2, 1
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 2
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 4
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 8
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 16
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 32
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 64
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 128
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 256
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 512
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 1024
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 2048
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 4096
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 8192
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 16384
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 32768
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 65536
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 131072
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
-; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    addi a2, sp, 8
 ; RV32V-NEXT:    sw a0, 8(sp)
 ; RV32V-NEXT:    sw zero, 12(sp)
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vlse64.v v16, (a2), zero
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vlse64.v v11, (a2), zero
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
 ; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    vand.vv v14, v8, v16
+; RV32V-NEXT:    vand.vv v11, v8, v11
 ; RV32V-NEXT:    vand.vx v8, v8, a0
 ; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    vxor.vv v8, v12, v8
-; RV32V-NEXT:    vmul.vv v10, v10, v14
-; RV32V-NEXT:    vxor.vv v10, v8, v10
-; RV32V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32V-NEXT:    vnsrl.wx v8, v10, a1
+; RV32V-NEXT:    vxor.vv v8, v9, v8
+; RV32V-NEXT:    vmul.vv v9, v10, v11
+; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32V-NEXT:    vnsrl.wx v8, v8, a1
 ; RV32V-NEXT:    addi sp, sp, 16
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmulh_nxv2i32_vx:
+; RV64V-LABEL: clmulh_nxv1i32_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV64V-NEXT:    vmv.v.x v12, a0
-; RV64V-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64V-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64V-NEXT:    vzext.vf2 v10, v8
+; RV64V-NEXT:    vzext.vf2 v8, v9
 ; RV64V-NEXT:    li a1, 16
 ; RV64V-NEXT:    li a0, 32
 ; RV64V-NEXT:    li a2, 64
-; RV64V-NEXT:    vzext.vf2 v8, v12
-; RV64V-NEXT:    vand.vi v12, v8, 2
-; RV64V-NEXT:    vand.vi v14, v8, 1
-; RV64V-NEXT:    vmul.vv v12, v10, v12
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v14, v12
-; RV64V-NEXT:    vand.vi v14, v8, 4
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vi v14, v8, 8
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a1
+; RV64V-NEXT:    vand.vi v9, v8, 2
+; RV64V-NEXT:    vand.vi v11, v8, 1
+; RV64V-NEXT:    vmul.vv v9, v10, v9
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v11, v9
+; RV64V-NEXT:    vand.vi v11, v8, 4
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vi v11, v8, 8
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a1
 ; RV64V-NEXT:    li a1, 128
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a0
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    li a2, 256
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a1
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a1
 ; RV64V-NEXT:    li a1, 512
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    li a2, 1024
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a1
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a1
 ; RV64V-NEXT:    li a1, 1
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    slli a2, a1, 11
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 1
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 2
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 4
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 8
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 16
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 32
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 64
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 128
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 256
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 512
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 1024
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 2048
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 4096
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 8192
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 16384
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 32768
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 65536
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 131072
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 262144
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    slli a1, a1, 31
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
 ; RV64V-NEXT:    vand.vx v8, v8, a1
-; RV64V-NEXT:    vmul.vv v14, v10, v14
-; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vxor.vv v9, v9, v11
 ; RV64V-NEXT:    vmul.vv v8, v10, v8
-; RV64V-NEXT:    vxor.vv v10, v12, v8
-; RV64V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64V-NEXT:    vnsrl.wx v8, v10, a0
+; RV64V-NEXT:    vxor.vv v8, v9, v8
+; RV64V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64V-NEXT:    vnsrl.wx v8, v8, a0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmulh_nxv2i32_vx:
+; RV32ZVBC64-LABEL: clmulh_nxv1i32_vv:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32ZVBC64-NEXT:    vmv.v.x v12, a0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC64-NEXT:    vzext.vf2 v8, v12
-; RV32ZVBC64-NEXT:    vclmul.vv v10, v10, v8
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v9
+; RV32ZVBC64-NEXT:    vzext.vf2 v9, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v9, v10
 ; RV32ZVBC64-NEXT:    li a0, 32
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wx v8, v10, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v8, a0
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmulh_nxv2i32_vx:
+; RV64ZVBC64-LABEL: clmulh_nxv1i32_vv:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV64ZVBC64-NEXT:    vmv.v.x v12, a0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC64-NEXT:    vzext.vf2 v8, v12
-; RV64ZVBC64-NEXT:    vclmul.vv v10, v10, v8
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v9
+; RV64ZVBC64-NEXT:    vzext.vf2 v9, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v9, v10
 ; RV64ZVBC64-NEXT:    li a0, 32
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wx v8, v10, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v8, a0
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmulh_nxv2i32_vx:
+; RV32ZVBC32-LABEL: clmulh_nxv1i32_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    addi sp, sp, -16
-; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v12, a0
-; RV32ZVBC32-NEXT:    lui a0, 524288
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV32ZVBC32-NEXT:    li a2, 16
-; RV32ZVBC32-NEXT:    li a1, 32
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v12
-; RV32ZVBC32-NEXT:    vand.vx v12, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 64
-; RV32ZVBC32-NEXT:    vand.vi v14, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v16, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v14, v16, v14
-; RV32ZVBC32-NEXT:    vand.vi v16, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
-; RV32ZVBC32-NEXT:    vand.vi v16, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v14, v14, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV32ZVBC32-NEXT:    vxor.vv v12, v14, v12
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a1
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 1
-; RV32ZVBC32-NEXT:    slli a2, a2, 11
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 1
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 2
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 4
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 8
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 16
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 32
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 64
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 2048
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 4096
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 8192
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 16384
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 32768
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 65536
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 131072
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV32ZVBC32-NEXT:    addi a2, sp, 8
-; RV32ZVBC32-NEXT:    sw a0, 8(sp)
-; RV32ZVBC32-NEXT:    sw zero, 12(sp)
-; RV32ZVBC32-NEXT:    vmul.vv v16, v10, v16
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v16
-; RV32ZVBC32-NEXT:    vlse64.v v16, (a2), zero
-; RV32ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV32ZVBC32-NEXT:    lui a0, 262144
-; RV32ZVBC32-NEXT:    vand.vv v14, v8, v16
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v12, v8
-; RV32ZVBC32-NEXT:    vmul.vv v10, v10, v14
-; RV32ZVBC32-NEXT:    vxor.vv v10, v8, v10
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wx v8, v10, a1
-; RV32ZVBC32-NEXT:    addi sp, sp, 16
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmulh_nxv2i32_vx:
+; RV64ZVBC32-LABEL: clmulh_nxv1i32_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v12, a0
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v10, v8
-; RV64ZVBC32-NEXT:    li a1, 16
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    li a2, 64
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v12
-; RV64ZVBC32-NEXT:    vand.vi v12, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v14, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v12, v10, v12
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v14, v12
-; RV64ZVBC32-NEXT:    vand.vi v14, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vi v14, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a1
-; RV64ZVBC32-NEXT:    li a1, 128
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    li a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a1
-; RV64ZVBC32-NEXT:    li a1, 512
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    li a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a1
-; RV64ZVBC32-NEXT:    li a1, 1
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    slli a2, a1, 11
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 1
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 2
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 4
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 8
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 16
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 32
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 64
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 128
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 512
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 262144
-; RV64ZVBC32-NEXT:    slli a1, a1, 31
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vand.vx v14, v8, a2
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a1
-; RV64ZVBC32-NEXT:    vmul.vv v14, v10, v14
-; RV64ZVBC32-NEXT:    vxor.vv v12, v12, v14
-; RV64ZVBC32-NEXT:    vmul.vv v8, v10, v8
-; RV64ZVBC32-NEXT:    vxor.vv v10, v12, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wx v8, v10, a0
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV64ZVBC32-NEXT:    ret
-  %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i64 0
-  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-  %va.ext = zext <vscale x 2 x i32> %va to <vscale x 2 x i64>
-  %vb.ext = zext <vscale x 2 x i32> %vb to <vscale x 2 x i64>
-  %clmul = call <vscale x 2 x i64> @llvm.clmul.nxv2i64(<vscale x 2 x i64> %va.ext, <vscale x 2 x i64> %vb.ext)
-  %res.ext = lshr <vscale x 2 x i64> %clmul, splat(i64 32)
-  %res = trunc <vscale x 2 x i64> %res.ext to <vscale x 2 x i32>
-  ret <vscale x 2 x i32> %res
+  %va.ext = zext <vscale x 1 x i32> %va to <vscale x 1 x i64>
+  %vb.ext = zext <vscale x 1 x i32> %vb to <vscale x 1 x i64>
+  %clmul = call <vscale x 1 x i64> @llvm.clmul.nxv1i64(<vscale x 1 x i64> %va.ext, <vscale x 1 x i64> %vb.ext)
+  %res.ext = lshr <vscale x 1 x i64> %clmul, splat(i64 32)
+  %res = trunc <vscale x 1 x i64> %res.ext to <vscale x 1 x i32>
+  ret <vscale x 1 x i32> %res
 }
 
-define <vscale x 4 x i32> @clmulh_nxv4i32_vv(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb) nounwind {
-; RV32V-LABEL: clmulh_nxv4i32_vv:
+define <vscale x 1 x i32> @clmulh_nxv1i32_vx(<vscale x 1 x i32> %va, i32 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv1i32_vx:
 ; RV32V:       # %bb.0:
 ; RV32V-NEXT:    addi sp, sp, -16
+; RV32V-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV32V-NEXT:    vmv.v.x v10, a0
 ; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32V-NEXT:    vzext.vf2 v12, v8
-; RV32V-NEXT:    vzext.vf2 v16, v10
+; RV32V-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32V-NEXT:    vzext.vf2 v9, v8
 ; RV32V-NEXT:    li a2, 16
 ; RV32V-NEXT:    li a1, 32
-; RV32V-NEXT:    vand.vx v8, v16, a2
+; RV32V-NEXT:    vzext.vf2 v8, v10
+; RV32V-NEXT:    vand.vx v10, v8, a2
 ; RV32V-NEXT:    li a2, 64
-; RV32V-NEXT:    vand.vi v20, v16, 2
-; RV32V-NEXT:    vand.vi v24, v16, 1
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v20, v24, v20
-; RV32V-NEXT:    vand.vi v24, v16, 4
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v20, v20, v24
-; RV32V-NEXT:    vand.vi v24, v16, 8
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v20, v20, v24
-; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    vand.vi v11, v8, 2
+; RV32V-NEXT:    vand.vi v12, v8, 1
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v11, v12, v11
+; RV32V-NEXT:    vand.vi v12, v8, 4
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v11, v11, v12
+; RV32V-NEXT:    vand.vi v12, v8, 8
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v11, v11, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    li a2, 128
-; RV32V-NEXT:    vmul.vv v8, v12, v8
-; RV32V-NEXT:    vxor.vv v8, v20, v8
-; RV32V-NEXT:    vand.vx v20, v16, a1
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    vmul.vv v10, v9, v10
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vx v11, v8, a1
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    li a2, 256
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    li a2, 512
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    li a2, 1024
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vand.vx v24, v16, a2
-; RV32V-NEXT:    lui a2, 1
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    li a2, 1
 ; RV32V-NEXT:    slli a2, a2, 11
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 1
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 2
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 4
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 8
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 16
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 32
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 64
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 128
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 256
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 512
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 1024
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 2048
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 4096
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 8192
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 16384
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 32768
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    lui a2, 65536
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    lui a2, 131072
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
 ; RV32V-NEXT:    addi a2, sp, 8
 ; RV32V-NEXT:    sw a0, 8(sp)
 ; RV32V-NEXT:    sw zero, 12(sp)
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vlse64.v v20, (a2), zero
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vlse64.v v12, (a2), zero
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
 ; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    vand.vv v20, v16, v20
-; RV32V-NEXT:    vand.vx v16, v16, a0
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vmul.vv v12, v12, v20
-; RV32V-NEXT:    vxor.vv v12, v8, v12
-; RV32V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32V-NEXT:    vnsrl.wx v8, v12, a1
+; RV32V-NEXT:    vand.vv v11, v8, v12
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    vmul.vv v9, v9, v11
+; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32V-NEXT:    vnsrl.wx v8, v8, a1
 ; RV32V-NEXT:    addi sp, sp, 16
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmulh_nxv4i32_vv:
+; RV64V-LABEL: clmulh_nxv1i32_vx:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64V-NEXT:    vzext.vf2 v12, v8
-; RV64V-NEXT:    vzext.vf2 v16, v10
+; RV64V-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV64V-NEXT:    vmv.v.x v10, a0
+; RV64V-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64V-NEXT:    vzext.vf2 v9, v8
 ; RV64V-NEXT:    li a1, 16
 ; RV64V-NEXT:    li a0, 32
 ; RV64V-NEXT:    li a2, 64
-; RV64V-NEXT:    vand.vi v8, v16, 2
-; RV64V-NEXT:    vand.vi v20, v16, 1
-; RV64V-NEXT:    vmul.vv v8, v12, v8
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v20, v8
-; RV64V-NEXT:    vand.vi v20, v16, 4
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vi v20, v16, 8
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a1
+; RV64V-NEXT:    vzext.vf2 v8, v10
+; RV64V-NEXT:    vand.vi v10, v8, 2
+; RV64V-NEXT:    vand.vi v11, v8, 1
+; RV64V-NEXT:    vmul.vv v10, v9, v10
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v8, 4
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v8, 8
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a1
 ; RV64V-NEXT:    li a1, 128
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a0
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    li a2, 256
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a1
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a1
 ; RV64V-NEXT:    li a1, 512
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    li a2, 1024
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a1
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a1
 ; RV64V-NEXT:    li a1, 1
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    slli a2, a1, 11
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 1
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 2
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 4
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 8
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 16
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 32
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 64
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 128
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 256
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 512
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 1024
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 2048
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 4096
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 8192
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 16384
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 32768
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 65536
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 131072
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
 ; RV64V-NEXT:    lui a2, 262144
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    slli a1, a1, 31
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vand.vx v16, v16, a1
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vmul.vv v12, v12, v16
-; RV64V-NEXT:    vxor.vv v12, v8, v12
-; RV64V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64V-NEXT:    vnsrl.wx v8, v12, a0
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a2
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v9, v8
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64V-NEXT:    vnsrl.wx v8, v8, a0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmulh_nxv4i32_vv:
+; RV32ZVBC64-LABEL: clmulh_nxv1i32_vx:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf2 v12, v10
-; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
-; RV32ZVBC64-NEXT:    vclmul.vv v12, v16, v12
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v9
+; RV32ZVBC64-NEXT:    vclmul.vv v8, v10, v8
 ; RV32ZVBC64-NEXT:    li a0, 32
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wx v8, v12, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v8, a0
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmulh_nxv4i32_vv:
+; RV64ZVBC64-LABEL: clmulh_nxv1i32_vx:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf2 v12, v10
-; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
-; RV64ZVBC64-NEXT:    vclmul.vv v12, v16, v12
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v9
+; RV64ZVBC64-NEXT:    vclmul.vv v8, v10, v8
 ; RV64ZVBC64-NEXT:    li a0, 32
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wx v8, v12, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v8, a0
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmulh_nxv4i32_vv:
+; RV32ZVBC32-LABEL: clmulh_nxv1i32_vx:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    addi sp, sp, -16
-; RV32ZVBC32-NEXT:    lui a0, 524288
-; RV32ZVBC32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v12, v8
-; RV32ZVBC32-NEXT:    vzext.vf2 v16, v10
-; RV32ZVBC32-NEXT:    li a2, 16
-; RV32ZVBC32-NEXT:    li a1, 32
-; RV32ZVBC32-NEXT:    vand.vx v8, v16, a2
-; RV32ZVBC32-NEXT:    li a2, 64
-; RV32ZVBC32-NEXT:    vand.vi v20, v16, 2
-; RV32ZVBC32-NEXT:    vand.vi v24, v16, 1
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v20, v24, v20
-; RV32ZVBC32-NEXT:    vand.vi v24, v16, 4
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v24
-; RV32ZVBC32-NEXT:    vand.vi v24, v16, 8
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
-; RV32ZVBC32-NEXT:    li a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v8, v12, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v20, v8
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a1
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV32ZVBC32-NEXT:    li a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
-; RV32ZVBC32-NEXT:    li a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV32ZVBC32-NEXT:    li a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 1
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV32ZVBC32-NEXT:    li a2, 1
-; RV32ZVBC32-NEXT:    slli a2, a2, 11
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 2
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 4
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 8
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 16
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 32
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 64
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 2048
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 4096
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 8192
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 16384
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 32768
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 65536
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV32ZVBC32-NEXT:    lui a2, 131072
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v16, a2
-; RV32ZVBC32-NEXT:    addi a2, sp, 8
-; RV32ZVBC32-NEXT:    sw a0, 8(sp)
-; RV32ZVBC32-NEXT:    sw zero, 12(sp)
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV32ZVBC32-NEXT:    vlse64.v v20, (a2), zero
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v24
-; RV32ZVBC32-NEXT:    lui a0, 262144
-; RV32ZVBC32-NEXT:    vand.vv v20, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v16, v16, a0
-; RV32ZVBC32-NEXT:    vmul.vv v16, v12, v16
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v16
-; RV32ZVBC32-NEXT:    vmul.vv v12, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v12, v8, v12
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wx v8, v12, a1
-; RV32ZVBC32-NEXT:    addi sp, sp, 16
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmulh_nxv4i32_vv:
+; RV64ZVBC32-LABEL: clmulh_nxv1i32_vx:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v12, v8
-; RV64ZVBC32-NEXT:    vzext.vf2 v16, v10
-; RV64ZVBC32-NEXT:    li a1, 16
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    li a2, 64
-; RV64ZVBC32-NEXT:    vand.vi v8, v16, 2
-; RV64ZVBC32-NEXT:    vand.vi v20, v16, 1
-; RV64ZVBC32-NEXT:    vmul.vv v8, v12, v8
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v20, v8
-; RV64ZVBC32-NEXT:    vand.vi v20, v16, 4
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vi v20, v16, 8
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a1
-; RV64ZVBC32-NEXT:    li a1, 128
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a0
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    li a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a1
-; RV64ZVBC32-NEXT:    li a1, 512
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    li a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a1
-; RV64ZVBC32-NEXT:    li a1, 1
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    slli a2, a1, 11
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 1
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 2
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 4
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 8
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 16
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 32
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 64
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 128
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 512
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    lui a2, 262144
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v16, a2
-; RV64ZVBC32-NEXT:    slli a1, a1, 31
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vand.vx v16, v16, a1
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v20
-; RV64ZVBC32-NEXT:    vmul.vv v12, v12, v16
-; RV64ZVBC32-NEXT:    vxor.vv v12, v8, v12
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wx v8, v12, a0
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
-  %va.ext = zext <vscale x 4 x i32> %va to <vscale x 4 x i64>
-  %vb.ext = zext <vscale x 4 x i32> %vb to <vscale x 4 x i64>
-  %clmul = call <vscale x 4 x i64> @llvm.clmul.nxv4i64(<vscale x 4 x i64> %va.ext, <vscale x 4 x i64> %vb.ext)
-  %res.ext = lshr <vscale x 4 x i64> %clmul, splat(i64 32)
-  %res = trunc <vscale x 4 x i64> %res.ext to <vscale x 4 x i32>
-  ret <vscale x 4 x i32> %res
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i64 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %va.ext = zext <vscale x 1 x i32> %va to <vscale x 1 x i64>
+  %vb.ext = zext <vscale x 1 x i32> %vb to <vscale x 1 x i64>
+  %clmul = call <vscale x 1 x i64> @llvm.clmul.nxv1i64(<vscale x 1 x i64> %va.ext, <vscale x 1 x i64> %vb.ext)
+  %res.ext = lshr <vscale x 1 x i64> %clmul, splat(i64 32)
+  %res = trunc <vscale x 1 x i64> %res.ext to <vscale x 1 x i32>
+  ret <vscale x 1 x i32> %res
 }
 
-define <vscale x 4 x i32> @clmulh_nxv4i32_vx(<vscale x 4 x i32> %va, i32 %b) nounwind {
-; RV32V-LABEL: clmulh_nxv4i32_vx:
+define <vscale x 2 x i32> @clmulh_nxv2i32_vv(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb) nounwind {
+; RV32V-LABEL: clmulh_nxv2i32_vv:
 ; RV32V:       # %bb.0:
 ; RV32V-NEXT:    addi sp, sp, -16
-; RV32V-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32V-NEXT:    vmv.v.x v16, a0
 ; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV32V-NEXT:    vzext.vf2 v12, v8
+; RV32V-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32V-NEXT:    vzext.vf2 v10, v8
+; RV32V-NEXT:    vzext.vf2 v12, v9
 ; RV32V-NEXT:    li a2, 16
 ; RV32V-NEXT:    li a1, 32
-; RV32V-NEXT:    vzext.vf2 v8, v16
-; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    vand.vx v8, v12, a2
 ; RV32V-NEXT:    li a2, 64
-; RV32V-NEXT:    vand.vi v20, v8, 2
-; RV32V-NEXT:    vand.vi v24, v8, 1
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v20, v24, v20
-; RV32V-NEXT:    vand.vi v24, v8, 4
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v20, v20, v24
-; RV32V-NEXT:    vand.vi v24, v8, 8
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v20, v20, v24
-; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    vand.vi v14, v12, 2
+; RV32V-NEXT:    vand.vi v16, v12, 1
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v14, v16, v14
+; RV32V-NEXT:    vand.vi v16, v12, 4
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    vand.vi v16, v12, 8
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
 ; RV32V-NEXT:    li a2, 128
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v16, v20, v16
-; RV32V-NEXT:    vand.vx v20, v8, a1
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    vxor.vv v8, v14, v8
+; RV32V-NEXT:    vand.vx v14, v12, a1
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
 ; RV32V-NEXT:    li a2, 256
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v16, v16, v24
-; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
 ; RV32V-NEXT:    li a2, 512
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
 ; RV32V-NEXT:    li a2, 1024
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v16, v16, v24
-; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
+; RV32V-NEXT:    lui a2, 1
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
 ; RV32V-NEXT:    li a2, 1
 ; RV32V-NEXT:    slli a2, a2, 11
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v8, a2
-; RV32V-NEXT:    lui a2, 1
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v16, v16, v24
-; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
 ; RV32V-NEXT:    lui a2, 2
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
 ; RV32V-NEXT:    lui a2, 4
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v16, v16, v24
-; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
 ; RV32V-NEXT:    lui a2, 8
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
 ; RV32V-NEXT:    lui a2, 16
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v16, v16, v24
-; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
 ; RV32V-NEXT:    lui a2, 32
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
 ; RV32V-NEXT:    lui a2, 64
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v16, v16, v24
-; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
 ; RV32V-NEXT:    lui a2, 128
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
 ; RV32V-NEXT:    lui a2, 256
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v16, v16, v24
-; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
 ; RV32V-NEXT:    lui a2, 512
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
 ; RV32V-NEXT:    lui a2, 1024
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v16, v16, v24
-; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
 ; RV32V-NEXT:    lui a2, 2048
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
 ; RV32V-NEXT:    lui a2, 4096
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v16, v16, v24
-; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
 ; RV32V-NEXT:    lui a2, 8192
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
 ; RV32V-NEXT:    lui a2, 16384
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v16, v16, v24
-; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
 ; RV32V-NEXT:    lui a2, 32768
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
 ; RV32V-NEXT:    lui a2, 65536
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v16, v16, v24
-; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
 ; RV32V-NEXT:    lui a2, 131072
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
 ; RV32V-NEXT:    addi a2, sp, 8
 ; RV32V-NEXT:    sw a0, 8(sp)
 ; RV32V-NEXT:    sw zero, 12(sp)
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v16, v16, v24
-; RV32V-NEXT:    vlse64.v v24, (a2), zero
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vlse64.v v14, (a2), zero
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    vand.vv v20, v8, v24
-; RV32V-NEXT:    vand.vx v8, v8, a0
-; RV32V-NEXT:    vmul.vv v8, v12, v8
-; RV32V-NEXT:    vxor.vv v8, v16, v8
-; RV32V-NEXT:    vmul.vv v12, v12, v20
-; RV32V-NEXT:    vxor.vv v12, v8, v12
-; RV32V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32V-NEXT:    vnsrl.wx v8, v12, a1
+; RV32V-NEXT:    vand.vv v14, v12, v14
+; RV32V-NEXT:    vand.vx v12, v12, a0
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v8, v8, v12
+; RV32V-NEXT:    vmul.vv v10, v10, v14
+; RV32V-NEXT:    vxor.vv v10, v8, v10
+; RV32V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32V-NEXT:    vnsrl.wx v8, v10, a1
 ; RV32V-NEXT:    addi sp, sp, 16
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmulh_nxv4i32_vx:
+; RV64V-LABEL: clmulh_nxv2i32_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV64V-NEXT:    vmv.v.x v16, a0
-; RV64V-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV64V-NEXT:    vzext.vf2 v12, v8
+; RV64V-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64V-NEXT:    vzext.vf2 v10, v8
+; RV64V-NEXT:    vzext.vf2 v12, v9
 ; RV64V-NEXT:    li a1, 16
 ; RV64V-NEXT:    li a0, 32
 ; RV64V-NEXT:    li a2, 64
-; RV64V-NEXT:    vzext.vf2 v8, v16
-; RV64V-NEXT:    vand.vi v16, v8, 2
-; RV64V-NEXT:    vand.vi v20, v8, 1
-; RV64V-NEXT:    vmul.vv v16, v12, v16
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v20, v16
-; RV64V-NEXT:    vand.vi v20, v8, 4
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vi v20, v8, 8
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a1
+; RV64V-NEXT:    vand.vi v8, v12, 2
+; RV64V-NEXT:    vand.vi v14, v12, 1
+; RV64V-NEXT:    vmul.vv v8, v10, v8
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v14, v8
+; RV64V-NEXT:    vand.vi v14, v12, 4
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vi v14, v12, 8
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a1
 ; RV64V-NEXT:    li a1, 128
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a0
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a0
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    li a2, 256
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a1
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a1
 ; RV64V-NEXT:    li a1, 512
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    li a2, 1024
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a1
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a1
 ; RV64V-NEXT:    li a1, 1
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    slli a2, a1, 11
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 1
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 2
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 4
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 8
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 16
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 32
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 64
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 128
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 256
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 512
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 1024
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 2048
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 4096
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 8192
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 16384
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 32768
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 65536
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 131072
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    lui a2, 262144
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a2
 ; RV64V-NEXT:    slli a1, a1, 31
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
-; RV64V-NEXT:    vand.vx v8, v8, a1
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vmul.vv v8, v12, v8
-; RV64V-NEXT:    vxor.vv v12, v16, v8
-; RV64V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64V-NEXT:    vnsrl.wx v8, v12, a0
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vand.vx v12, v12, a1
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vmul.vv v10, v10, v12
+; RV64V-NEXT:    vxor.vv v10, v8, v10
+; RV64V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64V-NEXT:    vnsrl.wx v8, v10, a0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmulh_nxv4i32_vx:
+; RV32ZVBC64-LABEL: clmulh_nxv2i32_vv:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32ZVBC64-NEXT:    vmv.v.x v16, a0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v9
 ; RV32ZVBC64-NEXT:    vzext.vf2 v12, v8
-; RV32ZVBC64-NEXT:    vzext.vf2 v8, v16
-; RV32ZVBC64-NEXT:    vclmul.vv v12, v12, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v10, v12, v10
 ; RV32ZVBC64-NEXT:    li a0, 32
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wx v8, v12, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v10, a0
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmulh_nxv4i32_vx:
+; RV64ZVBC64-LABEL: clmulh_nxv2i32_vv:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v9
 ; RV64ZVBC64-NEXT:    vzext.vf2 v12, v8
-; RV64ZVBC64-NEXT:    vzext.vf2 v8, v16
-; RV64ZVBC64-NEXT:    vclmul.vv v12, v12, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v10, v12, v10
 ; RV64ZVBC64-NEXT:    li a0, 32
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wx v8, v12, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v10, a0
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmulh_nxv4i32_vx:
+; RV32ZVBC32-LABEL: clmulh_nxv2i32_vv:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    addi sp, sp, -16
-; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v16, a0
-; RV32ZVBC32-NEXT:    lui a0, 524288
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v12, v8
-; RV32ZVBC32-NEXT:    li a2, 16
-; RV32ZVBC32-NEXT:    li a1, 32
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v16
-; RV32ZVBC32-NEXT:    vand.vx v16, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 64
-; RV32ZVBC32-NEXT:    vand.vi v20, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v24, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v20, v24, v20
-; RV32ZVBC32-NEXT:    vand.vi v24, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v24
-; RV32ZVBC32-NEXT:    vand.vi v24, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v20, v20, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v16, v12, v16
-; RV32ZVBC32-NEXT:    vxor.vv v16, v20, v16
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a1
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 1
-; RV32ZVBC32-NEXT:    slli a2, a2, 11
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 1
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 2
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 4
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 8
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 16
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 32
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 64
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 2048
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 4096
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 8192
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 16384
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 32768
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 65536
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
-; RV32ZVBC32-NEXT:    vand.vx v24, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 131072
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV32ZVBC32-NEXT:    addi a2, sp, 8
-; RV32ZVBC32-NEXT:    sw a0, 8(sp)
-; RV32ZVBC32-NEXT:    sw zero, 12(sp)
-; RV32ZVBC32-NEXT:    vmul.vv v24, v12, v24
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v24
-; RV32ZVBC32-NEXT:    vlse64.v v24, (a2), zero
-; RV32ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV32ZVBC32-NEXT:    lui a0, 262144
-; RV32ZVBC32-NEXT:    vand.vv v20, v8, v24
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v8, v12, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v16, v8
-; RV32ZVBC32-NEXT:    vmul.vv v12, v12, v20
-; RV32ZVBC32-NEXT:    vxor.vv v12, v8, v12
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wx v8, v12, a1
-; RV32ZVBC32-NEXT:    addi sp, sp, 16
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmulh_nxv4i32_vx:
+; RV64ZVBC32-LABEL: clmulh_nxv2i32_vv:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v16, a0
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v12, v8
-; RV64ZVBC32-NEXT:    li a1, 16
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    li a2, 64
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v16
-; RV64ZVBC32-NEXT:    vand.vi v16, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v20, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v16, v12, v16
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v20, v16
-; RV64ZVBC32-NEXT:    vand.vi v20, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vi v20, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a1
-; RV64ZVBC32-NEXT:    li a1, 128
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    li a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a1
-; RV64ZVBC32-NEXT:    li a1, 512
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    li a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a1
-; RV64ZVBC32-NEXT:    li a1, 1
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    slli a2, a1, 11
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 1
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 2
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 4
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 8
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 16
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 32
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 64
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 128
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 512
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 262144
-; RV64ZVBC32-NEXT:    slli a1, a1, 31
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vand.vx v20, v8, a2
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a1
-; RV64ZVBC32-NEXT:    vmul.vv v20, v12, v20
-; RV64ZVBC32-NEXT:    vxor.vv v16, v16, v20
-; RV64ZVBC32-NEXT:    vmul.vv v8, v12, v8
-; RV64ZVBC32-NEXT:    vxor.vv v12, v16, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wx v8, v12, a0
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v9
 ; RV64ZVBC32-NEXT:    ret
-  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i64 0
-  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-  %va.ext = zext <vscale x 4 x i32> %va to <vscale x 4 x i64>
-  %vb.ext = zext <vscale x 4 x i32> %vb to <vscale x 4 x i64>
-  %clmul = call <vscale x 4 x i64> @llvm.clmul.nxv4i64(<vscale x 4 x i64> %va.ext, <vscale x 4 x i64> %vb.ext)
-  %res.ext = lshr <vscale x 4 x i64> %clmul, splat(i64 32)
-  %res = trunc <vscale x 4 x i64> %res.ext to <vscale x 4 x i32>
-  ret <vscale x 4 x i32> %res
+  %va.ext = zext <vscale x 2 x i32> %va to <vscale x 2 x i64>
+  %vb.ext = zext <vscale x 2 x i32> %vb to <vscale x 2 x i64>
+  %clmul = call <vscale x 2 x i64> @llvm.clmul.nxv2i64(<vscale x 2 x i64> %va.ext, <vscale x 2 x i64> %vb.ext)
+  %res.ext = lshr <vscale x 2 x i64> %clmul, splat(i64 32)
+  %res = trunc <vscale x 2 x i64> %res.ext to <vscale x 2 x i32>
+  ret <vscale x 2 x i32> %res
 }
 
-define <vscale x 8 x i32> @clmulh_nxv8i32_vv(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb) nounwind {
-; RV32V-LABEL: clmulh_nxv8i32_vv:
+define <vscale x 2 x i32> @clmulh_nxv2i32_vx(<vscale x 2 x i32> %va, i32 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv2i32_vx:
 ; RV32V:       # %bb.0:
 ; RV32V-NEXT:    addi sp, sp, -16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    sub sp, sp, a0
+; RV32V-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32V-NEXT:    vmv.v.x v12, a0
 ; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32V-NEXT:    vzext.vf2 v16, v8
-; RV32V-NEXT:    vzext.vf2 v24, v12
+; RV32V-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32V-NEXT:    vzext.vf2 v10, v8
 ; RV32V-NEXT:    li a2, 16
 ; RV32V-NEXT:    li a1, 32
-; RV32V-NEXT:    vand.vi v8, v24, 2
-; RV32V-NEXT:    vand.vi v0, v24, 1
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v0, v8
-; RV32V-NEXT:    vand.vi v0, v24, 4
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vi v0, v24, 8
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vzext.vf2 v8, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
 ; RV32V-NEXT:    li a2, 64
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a1
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vand.vi v14, v8, 2
+; RV32V-NEXT:    vand.vi v16, v8, 1
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v14, v16, v14
+; RV32V-NEXT:    vand.vi v16, v8, 4
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    vand.vi v16, v8, 8
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
 ; RV32V-NEXT:    li a2, 128
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v12, v14, v12
+; RV32V-NEXT:    vand.vx v14, v8, a1
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
 ; RV32V-NEXT:    li a2, 256
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
 ; RV32V-NEXT:    li a2, 512
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
 ; RV32V-NEXT:    li a2, 1024
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
 ; RV32V-NEXT:    li a2, 1
 ; RV32V-NEXT:    slli a2, a2, 11
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
 ; RV32V-NEXT:    lui a2, 1
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
 ; RV32V-NEXT:    lui a2, 2
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
 ; RV32V-NEXT:    lui a2, 4
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
 ; RV32V-NEXT:    lui a2, 8
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
 ; RV32V-NEXT:    lui a2, 16
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
 ; RV32V-NEXT:    lui a2, 32
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
 ; RV32V-NEXT:    lui a2, 64
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
 ; RV32V-NEXT:    lui a2, 128
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
 ; RV32V-NEXT:    lui a2, 256
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
 ; RV32V-NEXT:    lui a2, 512
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
 ; RV32V-NEXT:    lui a2, 1024
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
 ; RV32V-NEXT:    lui a2, 2048
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
 ; RV32V-NEXT:    lui a2, 4096
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
 ; RV32V-NEXT:    lui a2, 8192
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
 ; RV32V-NEXT:    lui a2, 16384
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
 ; RV32V-NEXT:    lui a2, 32768
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
 ; RV32V-NEXT:    lui a2, 65536
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
 ; RV32V-NEXT:    lui a2, 131072
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    csrr a3, vlenb
-; RV32V-NEXT:    slli a3, a3, 3
-; RV32V-NEXT:    add a3, sp, a3
-; RV32V-NEXT:    addi a3, a3, 16
-; RV32V-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
 ; RV32V-NEXT:    addi a2, sp, 8
 ; RV32V-NEXT:    sw a0, 8(sp)
 ; RV32V-NEXT:    sw zero, 12(sp)
-; RV32V-NEXT:    vlse64.v v8, (a2), zero
-; RV32V-NEXT:    addi a0, sp, 16
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 16
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v0, v8, v0
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vlse64.v v16, (a2), zero
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
 ; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    addi a2, sp, 16
-; RV32V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    vand.vx v24, v24, a0
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v24, v0, v24
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    vxor.vv v16, v24, v8
-; RV32V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32V-NEXT:    vnsrl.wx v8, v16, a1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add sp, sp, a0
+; RV32V-NEXT:    vand.vv v14, v8, v16
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    vmul.vv v10, v10, v14
+; RV32V-NEXT:    vxor.vv v10, v8, v10
+; RV32V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32V-NEXT:    vnsrl.wx v8, v10, a1
 ; RV32V-NEXT:    addi sp, sp, 16
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmulh_nxv8i32_vv:
+; RV64V-LABEL: clmulh_nxv2i32_vx:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64V-NEXT:    vzext.vf2 v16, v8
-; RV64V-NEXT:    vzext.vf2 v24, v12
+; RV64V-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV64V-NEXT:    vmv.v.x v12, a0
+; RV64V-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64V-NEXT:    vzext.vf2 v10, v8
 ; RV64V-NEXT:    li a1, 16
 ; RV64V-NEXT:    li a0, 32
 ; RV64V-NEXT:    li a2, 64
-; RV64V-NEXT:    vand.vi v8, v24, 2
-; RV64V-NEXT:    vand.vi v0, v24, 1
-; RV64V-NEXT:    vmul.vv v8, v16, v8
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v0, v8
-; RV64V-NEXT:    vand.vi v0, v24, 4
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vi v0, v24, 8
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a1
+; RV64V-NEXT:    vzext.vf2 v8, v12
+; RV64V-NEXT:    vand.vi v12, v8, 2
+; RV64V-NEXT:    vand.vi v14, v8, 1
+; RV64V-NEXT:    vmul.vv v12, v10, v12
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v14, v12
+; RV64V-NEXT:    vand.vi v14, v8, 4
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vi v14, v8, 8
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a1
 ; RV64V-NEXT:    li a1, 128
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a0
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a0
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    li a2, 256
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a1
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a1
 ; RV64V-NEXT:    li a1, 512
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    li a2, 1024
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a1
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a1
 ; RV64V-NEXT:    li a1, 1
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    slli a2, a1, 11
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 1
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 2
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 4
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 8
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 16
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 32
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 64
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 128
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 256
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 512
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 1024
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 2048
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 4096
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 8192
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 16384
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 32768
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 65536
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 131072
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
 ; RV64V-NEXT:    lui a2, 262144
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    slli a1, a1, 31
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vand.vx v24, v24, a1
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vmul.vv v16, v16, v24
-; RV64V-NEXT:    vxor.vv v16, v8, v16
-; RV64V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64V-NEXT:    vnsrl.wx v8, v16, a0
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a2
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vv v8, v10, v8
+; RV64V-NEXT:    vxor.vv v10, v12, v8
+; RV64V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64V-NEXT:    vnsrl.wx v8, v10, a0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmulh_nxv8i32_vv:
+; RV32ZVBC64-LABEL: clmulh_nxv2i32_vx:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf2 v16, v12
-; RV32ZVBC64-NEXT:    vzext.vf2 v24, v8
-; RV32ZVBC64-NEXT:    vclmul.vv v16, v24, v16
-; RV32ZVBC64-NEXT:    li a0, 32
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wx v8, v16, a0
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v12
+; RV32ZVBC64-NEXT:    vclmul.vv v10, v10, v8
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v10, a0
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmulh_nxv8i32_vv:
+; RV64ZVBC64-LABEL: clmulh_nxv2i32_vx:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf2 v16, v12
-; RV64ZVBC64-NEXT:    vzext.vf2 v24, v8
-; RV64ZVBC64-NEXT:    vclmul.vv v16, v24, v16
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v12
+; RV64ZVBC64-NEXT:    vclmul.vv v10, v10, v8
 ; RV64ZVBC64-NEXT:    li a0, 32
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wx v8, v16, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v10, a0
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmulh_nxv8i32_vv:
+; RV32ZVBC32-LABEL: clmulh_nxv2i32_vx:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    addi sp, sp, -16
-; RV32ZVBC32-NEXT:    csrr a0, vlenb
-; RV32ZVBC32-NEXT:    slli a0, a0, 4
-; RV32ZVBC32-NEXT:    sub sp, sp, a0
-; RV32ZVBC32-NEXT:    lui a0, 524288
-; RV32ZVBC32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v16, v8
-; RV32ZVBC32-NEXT:    vzext.vf2 v24, v12
-; RV32ZVBC32-NEXT:    li a2, 16
-; RV32ZVBC32-NEXT:    li a1, 32
-; RV32ZVBC32-NEXT:    vand.vi v8, v24, 2
-; RV32ZVBC32-NEXT:    vand.vi v0, v24, 1
-; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v0, v8
-; RV32ZVBC32-NEXT:    vand.vi v0, v24, 4
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vi v0, v24, 8
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    li a2, 64
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a1
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    li a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    li a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    li a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    li a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    li a2, 1
-; RV32ZVBC32-NEXT:    slli a2, a2, 11
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 1
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 2
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 4
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 8
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 16
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 32
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 64
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 2048
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 4096
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 8192
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 16384
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 32768
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 65536
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    lui a2, 131072
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV32ZVBC32-NEXT:    csrr a3, vlenb
-; RV32ZVBC32-NEXT:    slli a3, a3, 3
-; RV32ZVBC32-NEXT:    add a3, sp, a3
-; RV32ZVBC32-NEXT:    addi a3, a3, 16
-; RV32ZVBC32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV32ZVBC32-NEXT:    addi a2, sp, 8
-; RV32ZVBC32-NEXT:    sw a0, 8(sp)
-; RV32ZVBC32-NEXT:    sw zero, 12(sp)
-; RV32ZVBC32-NEXT:    vlse64.v v8, (a2), zero
-; RV32ZVBC32-NEXT:    addi a0, sp, 16
-; RV32ZVBC32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    csrr a0, vlenb
-; RV32ZVBC32-NEXT:    slli a0, a0, 3
-; RV32ZVBC32-NEXT:    add a0, sp, a0
-; RV32ZVBC32-NEXT:    addi a0, a0, 16
-; RV32ZVBC32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32ZVBC32-NEXT:    vxor.vv v0, v8, v0
-; RV32ZVBC32-NEXT:    lui a0, 262144
-; RV32ZVBC32-NEXT:    addi a2, sp, 16
-; RV32ZVBC32-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; RV32ZVBC32-NEXT:    vand.vv v8, v24, v8
-; RV32ZVBC32-NEXT:    vand.vx v24, v24, a0
-; RV32ZVBC32-NEXT:    vmul.vv v24, v16, v24
-; RV32ZVBC32-NEXT:    vxor.vv v24, v0, v24
-; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
-; RV32ZVBC32-NEXT:    vxor.vv v16, v24, v8
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wx v8, v16, a1
-; RV32ZVBC32-NEXT:    csrr a0, vlenb
-; RV32ZVBC32-NEXT:    slli a0, a0, 4
-; RV32ZVBC32-NEXT:    add sp, sp, a0
-; RV32ZVBC32-NEXT:    addi sp, sp, 16
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmulh_nxv8i32_vv:
+; RV64ZVBC32-LABEL: clmulh_nxv2i32_vx:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v16, v8
-; RV64ZVBC32-NEXT:    vzext.vf2 v24, v12
-; RV64ZVBC32-NEXT:    li a1, 16
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    li a2, 64
-; RV64ZVBC32-NEXT:    vand.vi v8, v24, 2
-; RV64ZVBC32-NEXT:    vand.vi v0, v24, 1
-; RV64ZVBC32-NEXT:    vmul.vv v8, v16, v8
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v0, v8
-; RV64ZVBC32-NEXT:    vand.vi v0, v24, 4
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vi v0, v24, 8
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a1
-; RV64ZVBC32-NEXT:    li a1, 128
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a0
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    li a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a1
-; RV64ZVBC32-NEXT:    li a1, 512
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    li a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a1
-; RV64ZVBC32-NEXT:    li a1, 1
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    slli a2, a1, 11
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 1
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 2
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 4
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 8
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 16
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 32
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 64
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 128
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 512
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    lui a2, 262144
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v24, a2
-; RV64ZVBC32-NEXT:    slli a1, a1, 31
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vand.vx v24, v24, a1
-; RV64ZVBC32-NEXT:    vxor.vv v8, v8, v0
-; RV64ZVBC32-NEXT:    vmul.vv v16, v16, v24
-; RV64ZVBC32-NEXT:    vxor.vv v16, v8, v16
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wx v8, v16, a0
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
-  %va.ext = zext <vscale x 8 x i32> %va to <vscale x 8 x i64>
-  %vb.ext = zext <vscale x 8 x i32> %vb to <vscale x 8 x i64>
-  %clmul = call <vscale x 8 x i64> @llvm.clmul.nxv8i64(<vscale x 8 x i64> %va.ext, <vscale x 8 x i64> %vb.ext)
-  %res.ext = lshr <vscale x 8 x i64> %clmul, splat(i64 32)
-  %res = trunc <vscale x 8 x i64> %res.ext to <vscale x 8 x i32>
-  ret <vscale x 8 x i32> %res
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i64 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %va.ext = zext <vscale x 2 x i32> %va to <vscale x 2 x i64>
+  %vb.ext = zext <vscale x 2 x i32> %vb to <vscale x 2 x i64>
+  %clmul = call <vscale x 2 x i64> @llvm.clmul.nxv2i64(<vscale x 2 x i64> %va.ext, <vscale x 2 x i64> %vb.ext)
+  %res.ext = lshr <vscale x 2 x i64> %clmul, splat(i64 32)
+  %res = trunc <vscale x 2 x i64> %res.ext to <vscale x 2 x i32>
+  ret <vscale x 2 x i32> %res
 }
 
-define <vscale x 8 x i32> @clmulh_nxv8i32_vx(<vscale x 8 x i32> %va, i32 %b) nounwind {
-; RV32V-LABEL: clmulh_nxv8i32_vx:
+define <vscale x 4 x i32> @clmulh_nxv4i32_vv(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb) nounwind {
+; RV32V-LABEL: clmulh_nxv4i32_vv:
 ; RV32V:       # %bb.0:
 ; RV32V-NEXT:    addi sp, sp, -16
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 4
-; RV32V-NEXT:    sub sp, sp, a1
-; RV32V-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32V-NEXT:    vmv.v.x v24, a0
 ; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32V-NEXT:    vzext.vf2 v16, v8
+; RV32V-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32V-NEXT:    vzext.vf2 v12, v8
+; RV32V-NEXT:    vzext.vf2 v16, v10
 ; RV32V-NEXT:    li a2, 16
 ; RV32V-NEXT:    li a1, 32
-; RV32V-NEXT:    vzext.vf2 v8, v24
-; RV32V-NEXT:    vand.vi v24, v8, 2
-; RV32V-NEXT:    vand.vi v0, v8, 1
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v0, v24
-; RV32V-NEXT:    vand.vi v0, v8, 4
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vi v0, v8, 8
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vand.vx v8, v16, a2
 ; RV32V-NEXT:    li a2, 64
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a1
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vand.vi v20, v16, 2
+; RV32V-NEXT:    vand.vi v24, v16, 1
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v20, v24, v20
+; RV32V-NEXT:    vand.vi v24, v16, 4
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v20, v20, v24
+; RV32V-NEXT:    vand.vi v24, v16, 8
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v20, v20, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
 ; RV32V-NEXT:    li a2, 128
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v8, v12, v8
+; RV32V-NEXT:    vxor.vv v8, v20, v8
+; RV32V-NEXT:    vand.vx v20, v16, a1
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
 ; RV32V-NEXT:    li a2, 256
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
 ; RV32V-NEXT:    li a2, 512
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
 ; RV32V-NEXT:    li a2, 1024
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    lui a2, 1
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
 ; RV32V-NEXT:    li a2, 1
 ; RV32V-NEXT:    slli a2, a2, 11
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
-; RV32V-NEXT:    lui a2, 1
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
 ; RV32V-NEXT:    lui a2, 2
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
 ; RV32V-NEXT:    lui a2, 4
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
 ; RV32V-NEXT:    lui a2, 8
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
 ; RV32V-NEXT:    lui a2, 16
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
 ; RV32V-NEXT:    lui a2, 32
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
 ; RV32V-NEXT:    lui a2, 64
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
 ; RV32V-NEXT:    lui a2, 128
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
 ; RV32V-NEXT:    lui a2, 256
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
 ; RV32V-NEXT:    lui a2, 512
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
 ; RV32V-NEXT:    lui a2, 1024
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
 ; RV32V-NEXT:    lui a2, 2048
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
 ; RV32V-NEXT:    lui a2, 4096
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
 ; RV32V-NEXT:    lui a2, 8192
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
 ; RV32V-NEXT:    lui a2, 16384
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
 ; RV32V-NEXT:    lui a2, 32768
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
 ; RV32V-NEXT:    lui a2, 65536
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
 ; RV32V-NEXT:    lui a2, 131072
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    csrr a3, vlenb
-; RV32V-NEXT:    slli a3, a3, 3
-; RV32V-NEXT:    add a3, sp, a3
-; RV32V-NEXT:    addi a3, a3, 16
-; RV32V-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
 ; RV32V-NEXT:    addi a2, sp, 8
 ; RV32V-NEXT:    sw a0, 8(sp)
 ; RV32V-NEXT:    sw zero, 12(sp)
-; RV32V-NEXT:    vlse64.v v24, (a2), zero
-; RV32V-NEXT:    addi a0, sp, 16
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 16
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v0, v24, v0
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vlse64.v v20, (a2), zero
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
 ; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    addi a2, sp, 16
-; RV32V-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    vand.vx v8, v8, a0
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    vxor.vv v8, v0, v8
-; RV32V-NEXT:    vmul.vv v16, v16, v24
-; RV32V-NEXT:    vxor.vv v16, v8, v16
-; RV32V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32V-NEXT:    vnsrl.wx v8, v16, a1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add sp, sp, a0
+; RV32V-NEXT:    vand.vv v20, v16, v20
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vmul.vv v16, v12, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vmul.vv v12, v12, v20
+; RV32V-NEXT:    vxor.vv v12, v8, v12
+; RV32V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32V-NEXT:    vnsrl.wx v8, v12, a1
 ; RV32V-NEXT:    addi sp, sp, 16
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmulh_nxv8i32_vx:
+; RV64V-LABEL: clmulh_nxv4i32_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV64V-NEXT:    vmv.v.x v24, a0
-; RV64V-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64V-NEXT:    vzext.vf2 v16, v8
+; RV64V-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64V-NEXT:    vzext.vf2 v12, v8
+; RV64V-NEXT:    vzext.vf2 v16, v10
 ; RV64V-NEXT:    li a1, 16
 ; RV64V-NEXT:    li a0, 32
 ; RV64V-NEXT:    li a2, 64
-; RV64V-NEXT:    vzext.vf2 v8, v24
-; RV64V-NEXT:    vand.vi v24, v8, 2
-; RV64V-NEXT:    vand.vi v0, v8, 1
-; RV64V-NEXT:    vmul.vv v24, v16, v24
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v0, v24
-; RV64V-NEXT:    vand.vi v0, v8, 4
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vi v0, v8, 8
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a1
+; RV64V-NEXT:    vand.vi v8, v16, 2
+; RV64V-NEXT:    vand.vi v20, v16, 1
+; RV64V-NEXT:    vmul.vv v8, v12, v8
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v20, v8
+; RV64V-NEXT:    vand.vi v20, v16, 4
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vi v20, v16, 8
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a1
 ; RV64V-NEXT:    li a1, 128
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a0
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    li a2, 256
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a1
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a1
 ; RV64V-NEXT:    li a1, 512
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    li a2, 1024
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a1
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a1
 ; RV64V-NEXT:    li a1, 1
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    slli a2, a1, 11
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 1
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 2
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 4
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 8
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 16
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 32
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 64
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 128
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 256
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 512
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 1024
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 2048
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 4096
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 8192
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 16384
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 32768
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 65536
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 131072
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    lui a2, 262144
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    slli a1, a1, 31
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vmul.vv v12, v12, v16
+; RV64V-NEXT:    vxor.vv v12, v8, v12
+; RV64V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64V-NEXT:    vnsrl.wx v8, v12, a0
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmulh_nxv4i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v12, v10
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v12, v16, v12
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v12, a0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv4i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v12, v10
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v12, v16, v12
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v12, a0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv4i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v10
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv4i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v10
+; RV64ZVBC32-NEXT:    ret
+  %va.ext = zext <vscale x 4 x i32> %va to <vscale x 4 x i64>
+  %vb.ext = zext <vscale x 4 x i32> %vb to <vscale x 4 x i64>
+  %clmul = call <vscale x 4 x i64> @llvm.clmul.nxv4i64(<vscale x 4 x i64> %va.ext, <vscale x 4 x i64> %vb.ext)
+  %res.ext = lshr <vscale x 4 x i64> %clmul, splat(i64 32)
+  %res = trunc <vscale x 4 x i64> %res.ext to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @clmulh_nxv4i32_vx(<vscale x 4 x i32> %va, i32 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv4i32_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -16
+; RV32V-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32V-NEXT:    vmv.v.x v16, a0
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32V-NEXT:    vzext.vf2 v12, v8
+; RV32V-NEXT:    li a2, 16
+; RV32V-NEXT:    li a1, 32
+; RV32V-NEXT:    vzext.vf2 v8, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    li a2, 64
+; RV32V-NEXT:    vand.vi v20, v8, 2
+; RV32V-NEXT:    vand.vi v24, v8, 1
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v20, v24, v20
+; RV32V-NEXT:    vand.vi v24, v8, 4
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v20, v20, v24
+; RV32V-NEXT:    vand.vi v24, v8, 8
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v20, v20, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    li a2, 128
+; RV32V-NEXT:    vmul.vv v16, v12, v16
+; RV32V-NEXT:    vxor.vv v16, v20, v16
+; RV32V-NEXT:    vand.vx v20, v8, a1
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    li a2, 256
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    li a2, 512
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    li a2, 1024
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    li a2, 1
+; RV32V-NEXT:    slli a2, a2, 11
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 1
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 2
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 4
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 8
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 16
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 32
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 64
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 128
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 256
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 512
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 1024
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 2048
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 4096
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 8192
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 16384
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 32768
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 65536
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 131072
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    addi a2, sp, 8
+; RV32V-NEXT:    sw a0, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vlse64.v v24, (a2), zero
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    vand.vv v20, v8, v24
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v8, v12, v8
+; RV32V-NEXT:    vxor.vv v8, v16, v8
+; RV32V-NEXT:    vmul.vv v12, v12, v20
+; RV32V-NEXT:    vxor.vv v12, v8, v12
+; RV32V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32V-NEXT:    vnsrl.wx v8, v12, a1
+; RV32V-NEXT:    addi sp, sp, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv4i32_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV64V-NEXT:    vmv.v.x v16, a0
+; RV64V-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64V-NEXT:    vzext.vf2 v12, v8
+; RV64V-NEXT:    li a1, 16
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    li a2, 64
+; RV64V-NEXT:    vzext.vf2 v8, v16
+; RV64V-NEXT:    vand.vi v16, v8, 2
+; RV64V-NEXT:    vand.vi v20, v8, 1
+; RV64V-NEXT:    vmul.vv v16, v12, v16
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v20, v16
+; RV64V-NEXT:    vand.vi v20, v8, 4
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vi v20, v8, 8
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a1
+; RV64V-NEXT:    li a1, 128
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    li a2, 256
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a1
+; RV64V-NEXT:    li a1, 512
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    li a2, 1024
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a1
+; RV64V-NEXT:    li a1, 1
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    slli a2, a1, 11
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 1
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 4
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 8
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 16
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 32
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 64
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 128
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 256
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 512
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 1024
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 2048
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 4096
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 8192
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 16384
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 32768
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 65536
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 131072
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    lui a2, 262144
+; RV64V-NEXT:    slli a1, a1, 31
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vv v8, v12, v8
+; RV64V-NEXT:    vxor.vv v12, v16, v8
+; RV64V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64V-NEXT:    vnsrl.wx v8, v12, a0
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmulh_nxv4i32_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v16
+; RV32ZVBC64-NEXT:    vclmul.vv v12, v12, v8
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v12, a0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv4i32_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v16
+; RV64ZVBC64-NEXT:    vclmul.vv v12, v12, v8
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v12, a0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv4i32_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv4i32_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i64 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %va.ext = zext <vscale x 4 x i32> %va to <vscale x 4 x i64>
+  %vb.ext = zext <vscale x 4 x i32> %vb to <vscale x 4 x i64>
+  %clmul = call <vscale x 4 x i64> @llvm.clmul.nxv4i64(<vscale x 4 x i64> %va.ext, <vscale x 4 x i64> %vb.ext)
+  %res.ext = lshr <vscale x 4 x i64> %clmul, splat(i64 32)
+  %res = trunc <vscale x 4 x i64> %res.ext to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 8 x i32> @clmulh_nxv8i32_vv(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb) nounwind {
+; RV32V-LABEL: clmulh_nxv8i32_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    sub sp, sp, a0
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32V-NEXT:    vzext.vf2 v16, v8
+; RV32V-NEXT:    vzext.vf2 v24, v12
+; RV32V-NEXT:    li a2, 16
+; RV32V-NEXT:    li a1, 32
+; RV32V-NEXT:    vand.vi v8, v24, 2
+; RV32V-NEXT:    vand.vi v0, v24, 1
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v0, v8
+; RV32V-NEXT:    vand.vi v0, v24, 4
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vi v0, v24, 8
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    li a2, 64
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a1
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    li a2, 128
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    li a2, 256
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    li a2, 512
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    li a2, 1024
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    li a2, 1
+; RV32V-NEXT:    slli a2, a2, 11
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 1
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 2
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 4
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 8
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 16
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 32
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 64
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 128
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 256
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 512
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 1024
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 2048
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 4096
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 8192
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 16384
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 32768
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 65536
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 131072
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    addi a2, sp, 8
+; RV32V-NEXT:    sw a0, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    vlse64.v v8, (a2), zero
+; RV32V-NEXT:    addi a0, sp, 16
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 16
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v0, v8, v0
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    addi a2, sp, 16
+; RV32V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v24, v8
+; RV32V-NEXT:    vand.vx v24, v24, a0
+; RV32V-NEXT:    vmul.vv v24, v16, v24
+; RV32V-NEXT:    vxor.vv v24, v0, v24
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vxor.vv v16, v24, v8
+; RV32V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vnsrl.wx v8, v16, a1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add sp, sp, a0
+; RV32V-NEXT:    addi sp, sp, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv8i32_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64V-NEXT:    vzext.vf2 v16, v8
+; RV64V-NEXT:    vzext.vf2 v24, v12
+; RV64V-NEXT:    li a1, 16
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    li a2, 64
+; RV64V-NEXT:    vand.vi v8, v24, 2
+; RV64V-NEXT:    vand.vi v0, v24, 1
+; RV64V-NEXT:    vmul.vv v8, v16, v8
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v0, v8
+; RV64V-NEXT:    vand.vi v0, v24, 4
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vi v0, v24, 8
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a1
+; RV64V-NEXT:    li a1, 128
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    li a2, 256
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a1
+; RV64V-NEXT:    li a1, 512
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    li a2, 1024
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a1
+; RV64V-NEXT:    li a1, 1
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    slli a2, a1, 11
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 1
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 4
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 8
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 16
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 32
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 64
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 128
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 256
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 512
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 1024
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 2048
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 4096
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 8192
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 16384
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 32768
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 65536
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 131072
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    lui a2, 262144
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    slli a1, a1, 31
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vand.vx v24, v24, a1
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vmul.vv v16, v16, v24
+; RV64V-NEXT:    vxor.vv v16, v8, v16
+; RV64V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64V-NEXT:    vnsrl.wx v8, v16, a0
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmulh_nxv8i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v12
+; RV32ZVBC64-NEXT:    vzext.vf2 v24, v8
+; RV32ZVBC64-NEXT:    vclmul.vv v16, v24, v16
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v16, a0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv8i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v12
+; RV64ZVBC64-NEXT:    vzext.vf2 v24, v8
+; RV64ZVBC64-NEXT:    vclmul.vv v16, v24, v16
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v16, a0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv8i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v12
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv8i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v12
+; RV64ZVBC32-NEXT:    ret
+  %va.ext = zext <vscale x 8 x i32> %va to <vscale x 8 x i64>
+  %vb.ext = zext <vscale x 8 x i32> %vb to <vscale x 8 x i64>
+  %clmul = call <vscale x 8 x i64> @llvm.clmul.nxv8i64(<vscale x 8 x i64> %va.ext, <vscale x 8 x i64> %vb.ext)
+  %res.ext = lshr <vscale x 8 x i64> %clmul, splat(i64 32)
+  %res = trunc <vscale x 8 x i64> %res.ext to <vscale x 8 x i32>
+  ret <vscale x 8 x i32> %res
+}
+
+define <vscale x 8 x i32> @clmulh_nxv8i32_vx(<vscale x 8 x i32> %va, i32 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv8i32_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -16
+; RV32V-NEXT:    csrr a1, vlenb
+; RV32V-NEXT:    slli a1, a1, 4
+; RV32V-NEXT:    sub sp, sp, a1
+; RV32V-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vmv.v.x v24, a0
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32V-NEXT:    vzext.vf2 v16, v8
+; RV32V-NEXT:    li a2, 16
+; RV32V-NEXT:    li a1, 32
+; RV32V-NEXT:    vzext.vf2 v8, v24
+; RV32V-NEXT:    vand.vi v24, v8, 2
+; RV32V-NEXT:    vand.vi v0, v8, 1
+; RV32V-NEXT:    vmul.vv v24, v16, v24
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v0, v24
+; RV32V-NEXT:    vand.vi v0, v8, 4
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vi v0, v8, 8
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    li a2, 64
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a1
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    li a2, 128
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    li a2, 256
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    li a2, 512
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    li a2, 1024
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    li a2, 1
+; RV32V-NEXT:    slli a2, a2, 11
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 1
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 2
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 4
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 8
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 16
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 32
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 64
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 128
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 256
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 512
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 1024
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 2048
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 4096
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 8192
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 16384
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 32768
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 65536
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 131072
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    addi a2, sp, 8
+; RV32V-NEXT:    sw a0, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    vlse64.v v24, (a2), zero
+; RV32V-NEXT:    addi a0, sp, 16
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 16
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v0, v24, v0
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    addi a2, sp, 16
+; RV32V-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vxor.vv v8, v0, v8
+; RV32V-NEXT:    vmul.vv v16, v16, v24
+; RV32V-NEXT:    vxor.vv v16, v8, v16
+; RV32V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vnsrl.wx v8, v16, a1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add sp, sp, a0
+; RV32V-NEXT:    addi sp, sp, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv8i32_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV64V-NEXT:    vmv.v.x v24, a0
+; RV64V-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64V-NEXT:    vzext.vf2 v16, v8
+; RV64V-NEXT:    li a1, 16
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    li a2, 64
+; RV64V-NEXT:    vzext.vf2 v8, v24
+; RV64V-NEXT:    vand.vi v24, v8, 2
+; RV64V-NEXT:    vand.vi v0, v8, 1
+; RV64V-NEXT:    vmul.vv v24, v16, v24
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v0, v24
+; RV64V-NEXT:    vand.vi v0, v8, 4
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vi v0, v8, 8
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a1
+; RV64V-NEXT:    li a1, 128
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    li a2, 256
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a1
+; RV64V-NEXT:    li a1, 512
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    li a2, 1024
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a1
+; RV64V-NEXT:    li a1, 1
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    slli a2, a1, 11
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 1
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 4
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 8
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 16
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 32
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 64
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 128
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 256
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 512
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 1024
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 2048
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 4096
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 8192
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 16384
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 32768
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 65536
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 131072
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    lui a2, 262144
+; RV64V-NEXT:    slli a1, a1, 31
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vv v8, v16, v8
+; RV64V-NEXT:    vxor.vv v16, v24, v8
+; RV64V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64V-NEXT:    vnsrl.wx v8, v16, a0
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmulh_nxv8i32_vx:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC64-NEXT:    vzext.vf2 v8, v24
+; RV32ZVBC64-NEXT:    vclmul.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    li a0, 32
+; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC64-NEXT:    vnsrl.wx v8, v16, a0
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv8i32_vx:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC64-NEXT:    vzext.vf2 v8, v24
+; RV64ZVBC64-NEXT:    vclmul.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    li a0, 32
+; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC64-NEXT:    vnsrl.wx v8, v16, a0
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv8i32_vx:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv8i32_vx:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
+; RV64ZVBC32-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i64 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %va.ext = zext <vscale x 8 x i32> %va to <vscale x 8 x i64>
+  %vb.ext = zext <vscale x 8 x i32> %vb to <vscale x 8 x i64>
+  %clmul = call <vscale x 8 x i64> @llvm.clmul.nxv8i64(<vscale x 8 x i64> %va.ext, <vscale x 8 x i64> %vb.ext)
+  %res.ext = lshr <vscale x 8 x i64> %clmul, splat(i64 32)
+  %res = trunc <vscale x 8 x i64> %res.ext to <vscale x 8 x i32>
+  ret <vscale x 8 x i32> %res
+}
+
+define <vscale x 16 x i32> @clmulh_nxv16i32_vv(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb) nounwind {
+; RV32V-LABEL: clmulh_nxv16i32_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -80
+; RV32V-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 68(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 64(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 60(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 56(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 52(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 48(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 44(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s8, 40(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s9, 36(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s10, 32(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s11, 28(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    sub sp, sp, a0
+; RV32V-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32V-NEXT:    vsrl.vi v24, v8, 8
+; RV32V-NEXT:    lui a0, 16
+; RV32V-NEXT:    vsrl.vi v0, v8, 24
+; RV32V-NEXT:    addi a0, a0, -256
+; RV32V-NEXT:    vand.vx v24, v24, a0
+; RV32V-NEXT:    vor.vv v24, v24, v0
+; RV32V-NEXT:    csrr a1, vlenb
+; RV32V-NEXT:    slli a1, a1, 3
+; RV32V-NEXT:    mv a2, a1
+; RV32V-NEXT:    slli a1, a1, 1
+; RV32V-NEXT:    add a2, a2, a1
+; RV32V-NEXT:    slli a1, a1, 4
+; RV32V-NEXT:    add a1, a1, a2
+; RV32V-NEXT:    add a1, sp, a1
+; RV32V-NEXT:    addi a1, a1, 16
+; RV32V-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vsrl.vi v0, v16, 8
+; RV32V-NEXT:    vand.vx v0, v0, a0
+; RV32V-NEXT:    vsrl.vi v24, v16, 24
+; RV32V-NEXT:    vor.vv v0, v0, v24
+; RV32V-NEXT:    vsll.vi v24, v8, 24
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vsll.vi v8, v8, 8
+; RV32V-NEXT:    vor.vv v8, v24, v8
+; RV32V-NEXT:    vsll.vi v24, v16, 24
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vsll.vi v16, v16, 8
+; RV32V-NEXT:    vor.vv v16, v24, v16
+; RV32V-NEXT:    lui a1, 61681
+; RV32V-NEXT:    lui a2, 209715
+; RV32V-NEXT:    lui t6, 349525
+; RV32V-NEXT:    li s5, 16
+; RV32V-NEXT:    li t2, 32
+; RV32V-NEXT:    li a7, 256
+; RV32V-NEXT:    li t0, 512
+; RV32V-NEXT:    li t1, 1024
+; RV32V-NEXT:    li s6, 1
+; RV32V-NEXT:    lui t3, 1
+; RV32V-NEXT:    lui t4, 2
+; RV32V-NEXT:    lui t5, 4
+; RV32V-NEXT:    lui s0, 8
+; RV32V-NEXT:    lui s1, 32
+; RV32V-NEXT:    lui s2, 64
+; RV32V-NEXT:    lui s3, 128
+; RV32V-NEXT:    lui s4, 256
+; RV32V-NEXT:    lui s7, 512
+; RV32V-NEXT:    lui s8, 1024
+; RV32V-NEXT:    lui s9, 2048
+; RV32V-NEXT:    lui s10, 4096
+; RV32V-NEXT:    lui s11, 8192
+; RV32V-NEXT:    lui ra, 16384
+; RV32V-NEXT:    lui a5, 32768
+; RV32V-NEXT:    addi a3, a1, -241
+; RV32V-NEXT:    addi a2, a2, 819
+; RV32V-NEXT:    addi a1, t6, 1365
+; RV32V-NEXT:    vor.vv v16, v16, v0
+; RV32V-NEXT:    vsrl.vi v24, v16, 4
+; RV32V-NEXT:    vand.vx v16, v16, a3
+; RV32V-NEXT:    vand.vx v24, v24, a3
+; RV32V-NEXT:    vsll.vi v16, v16, 4
+; RV32V-NEXT:    vor.vv v16, v24, v16
+; RV32V-NEXT:    vsrl.vi v24, v16, 2
+; RV32V-NEXT:    vand.vx v16, v16, a2
+; RV32V-NEXT:    vand.vx v24, v24, a2
+; RV32V-NEXT:    vsll.vi v16, v16, 2
+; RV32V-NEXT:    vor.vv v16, v24, v16
+; RV32V-NEXT:    vsrl.vi v24, v16, 1
+; RV32V-NEXT:    vand.vx v16, v16, a1
+; RV32V-NEXT:    vand.vx v24, v24, a1
+; RV32V-NEXT:    vadd.vv v16, v16, v16
+; RV32V-NEXT:    vor.vv v0, v24, v16
+; RV32V-NEXT:    vand.vx v16, v0, s5
+; RV32V-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a4, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 16
+; RV32V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui t6, 65536
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a4, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a4, a4, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 16
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v8, v8, v16
+; RV32V-NEXT:    vsrl.vi v16, v8, 4
+; RV32V-NEXT:    vand.vx v8, v8, a3
+; RV32V-NEXT:    vand.vx v16, v16, a3
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 2
+; RV32V-NEXT:    vand.vx v8, v8, a2
+; RV32V-NEXT:    vand.vx v16, v16, a2
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
+; RV32V-NEXT:    vand.vx v8, v8, a1
+; RV32V-NEXT:    vand.vx v16, v16, a1
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vand.vx v16, v0, t2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a4, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a4, a4, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 16
+; RV32V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a4, 131072
+; RV32V-NEXT:    slli s6, s6, 11
+; RV32V-NEXT:    lui t2, 262144
+; RV32V-NEXT:    lui s5, 524288
+; RV32V-NEXT:    li a6, 64
+; RV32V-NEXT:    vand.vx v16, v0, a6
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 3
+; RV32V-NEXT:    mv a0, a6
+; RV32V-NEXT:    slli a6, a6, 5
+; RV32V-NEXT:    add a6, a6, a0
+; RV32V-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    li a6, 128
+; RV32V-NEXT:    vand.vx v16, v0, a6
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 8
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, a7
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 3
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t0
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 4
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t1
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 3
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 2
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, s6
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 5
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t3
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 3
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 2
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t4
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 4
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 2
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t5
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 3
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 3
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, s0
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 6
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a6, 16
+; RV32V-NEXT:    vand.vx v16, v0, a6
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 3
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 2
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, s1
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 4
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 2
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, s2
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 3
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 2
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 2
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, s3
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 5
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 2
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, s4
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 3
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 3
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, s7
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 4
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 3
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, s8
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 3
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 4
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, s9
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, s10
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 3
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, s11
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 4
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, ra
+; RV32V-NEXT:    csrr a6, vlenb
+; RV32V-NEXT:    slli a6, a6, 3
+; RV32V-NEXT:    mv a7, a6
+; RV32V-NEXT:    slli a6, a6, 2
+; RV32V-NEXT:    add a7, a7, a6
+; RV32V-NEXT:    slli a6, a6, 1
+; RV32V-NEXT:    add a6, a6, a7
+; RV32V-NEXT:    add a6, sp, a6
+; RV32V-NEXT:    addi a6, a6, 16
+; RV32V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, a5
+; RV32V-NEXT:    csrr a5, vlenb
+; RV32V-NEXT:    slli a5, a5, 5
+; RV32V-NEXT:    mv a6, a5
+; RV32V-NEXT:    slli a5, a5, 1
+; RV32V-NEXT:    add a5, a5, a6
+; RV32V-NEXT:    add a5, sp, a5
+; RV32V-NEXT:    addi a5, a5, 16
+; RV32V-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t6
+; RV32V-NEXT:    csrr a5, vlenb
+; RV32V-NEXT:    slli a5, a5, 3
+; RV32V-NEXT:    mv a6, a5
+; RV32V-NEXT:    slli a5, a5, 1
+; RV32V-NEXT:    add a6, a6, a5
+; RV32V-NEXT:    slli a5, a5, 2
+; RV32V-NEXT:    add a5, a5, a6
+; RV32V-NEXT:    add a5, sp, a5
+; RV32V-NEXT:    addi a5, a5, 16
+; RV32V-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, a4
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v16, v0, 2
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v24, v0, 1
+; RV32V-NEXT:    vand.vi v16, v0, 4
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v16, v0, 8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, t2
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 6
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v0, s5
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v16, v8, v24
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v24, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v0, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 8
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    addi a4, sp, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 6
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 6
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 7
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 7
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 8
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 6
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v8, v16
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v8
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v0
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v16, v8
+; RV32V-NEXT:    addi a4, sp, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 6
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 2
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 7
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v0, v24, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v0, v0, v8
+; RV32V-NEXT:    vsll.vi v16, v16, 24
+; RV32V-NEXT:    vand.vx v8, v24, a0
+; RV32V-NEXT:    vsll.vi v8, v8, 8
+; RV32V-NEXT:    vor.vv v16, v16, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v0, v8
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 5
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 8
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a5, a5, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add a4, a4, a5
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vsrl.vi v24, v24, 8
+; RV32V-NEXT:    vand.vx v24, v24, a0
+; RV32V-NEXT:    vsrl.vi v8, v8, 24
+; RV32V-NEXT:    vor.vv v8, v24, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 4
+; RV32V-NEXT:    vand.vx v8, v8, a3
+; RV32V-NEXT:    vand.vx v16, v16, a3
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 2
+; RV32V-NEXT:    vand.vx v8, v8, a2
+; RV32V-NEXT:    vand.vx v16, v16, a2
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
+; RV32V-NEXT:    vand.vx v8, v8, a1
+; RV32V-NEXT:    vand.vx v16, v16, a1
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v8, v8, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add sp, sp, a0
+; RV32V-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 64(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 60(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 56(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 52(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 48(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 44(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s8, 40(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s9, 36(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s10, 32(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s11, 28(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    addi sp, sp, 80
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv16i32_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    addi sp, sp, -144
+; RV64V-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s0, 128(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s1, 120(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s2, 112(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s3, 104(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s4, 96(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s5, 88(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s6, 80(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s7, 72(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s8, 64(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s9, 56(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s10, 48(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s11, 40(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    sub sp, sp, a0
+; RV64V-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV64V-NEXT:    vsrl.vi v24, v8, 8
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vsrl.vi v0, v8, 24
+; RV64V-NEXT:    addi a0, a0, -256
+; RV64V-NEXT:    vand.vx v24, v24, a0
+; RV64V-NEXT:    vor.vv v24, v24, v0
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 4
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 32
+; RV64V-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vsrl.vi v0, v16, 8
+; RV64V-NEXT:    vand.vx v0, v0, a0
+; RV64V-NEXT:    vsrl.vi v24, v16, 24
+; RV64V-NEXT:    vor.vv v0, v0, v24
+; RV64V-NEXT:    vsll.vi v24, v8, 24
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vsll.vi v8, v8, 8
+; RV64V-NEXT:    vor.vv v8, v24, v8
+; RV64V-NEXT:    vsll.vi v24, v16, 24
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vsll.vi v16, v16, 8
+; RV64V-NEXT:    vor.vv v16, v24, v16
+; RV64V-NEXT:    lui a1, 61681
+; RV64V-NEXT:    lui a2, 209715
+; RV64V-NEXT:    lui t6, 349525
+; RV64V-NEXT:    li s5, 16
+; RV64V-NEXT:    li t2, 32
+; RV64V-NEXT:    li a7, 256
+; RV64V-NEXT:    li t0, 512
+; RV64V-NEXT:    li t1, 1024
+; RV64V-NEXT:    li s6, 1
+; RV64V-NEXT:    lui t3, 1
+; RV64V-NEXT:    lui t4, 2
+; RV64V-NEXT:    lui t5, 4
+; RV64V-NEXT:    lui s0, 8
+; RV64V-NEXT:    lui s1, 32
+; RV64V-NEXT:    lui s2, 64
+; RV64V-NEXT:    lui s3, 128
+; RV64V-NEXT:    lui s4, 256
+; RV64V-NEXT:    lui s7, 512
+; RV64V-NEXT:    lui s8, 1024
+; RV64V-NEXT:    lui s9, 2048
+; RV64V-NEXT:    lui s10, 4096
+; RV64V-NEXT:    lui s11, 8192
+; RV64V-NEXT:    lui ra, 16384
+; RV64V-NEXT:    lui a5, 32768
+; RV64V-NEXT:    addi a3, a1, -241
+; RV64V-NEXT:    addi a2, a2, 819
+; RV64V-NEXT:    addi a1, t6, 1365
+; RV64V-NEXT:    vor.vv v16, v16, v0
+; RV64V-NEXT:    vsrl.vi v24, v16, 4
+; RV64V-NEXT:    vand.vx v16, v16, a3
+; RV64V-NEXT:    vand.vx v24, v24, a3
+; RV64V-NEXT:    vsll.vi v16, v16, 4
+; RV64V-NEXT:    vor.vv v16, v24, v16
+; RV64V-NEXT:    vsrl.vi v24, v16, 2
+; RV64V-NEXT:    vand.vx v16, v16, a2
+; RV64V-NEXT:    vand.vx v24, v24, a2
+; RV64V-NEXT:    vsll.vi v16, v16, 2
+; RV64V-NEXT:    vor.vv v16, v24, v16
+; RV64V-NEXT:    vsrl.vi v24, v16, 1
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vand.vx v24, v24, a1
+; RV64V-NEXT:    vadd.vv v16, v16, v16
+; RV64V-NEXT:    vor.vv v0, v24, v16
+; RV64V-NEXT:    vand.vx v16, v0, s5
+; RV64V-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a4, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a4
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 32
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    lui t6, 65536
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a4, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a4, a4, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a4
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 32
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vor.vv v8, v8, v16
+; RV64V-NEXT:    vsrl.vi v16, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, a3
+; RV64V-NEXT:    vand.vx v16, v16, a3
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a2
+; RV64V-NEXT:    vand.vx v16, v16, a2
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vand.vx v16, v0, t2
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a4, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a4, a4, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a4
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 32
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    lui a4, 131072
+; RV64V-NEXT:    slli s6, s6, 11
+; RV64V-NEXT:    lui t2, 262144
+; RV64V-NEXT:    lui s5, 524288
+; RV64V-NEXT:    li a6, 64
+; RV64V-NEXT:    vand.vx v16, v0, a6
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a0, a6
+; RV64V-NEXT:    slli a6, a6, 5
+; RV64V-NEXT:    add a6, a6, a0
+; RV64V-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    li a6, 128
+; RV64V-NEXT:    vand.vx v16, v0, a6
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 8
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a7
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t0
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 4
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t1
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s6
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 5
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t3
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t4
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 4
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t5
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s0
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 6
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    lui a6, 16
+; RV64V-NEXT:    vand.vx v16, v0, a6
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s1
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 4
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s2
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s3
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 5
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s4
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s7
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 4
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s8
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 4
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s9
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s10
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s11
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 4
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, ra
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a5
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 5
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 32
+; RV64V-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t6
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 2
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 32
+; RV64V-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a4
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v16, v0, 2
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v24, v0, 1
+; RV64V-NEXT:    vand.vi v16, v0, 4
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v16, v0, 8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t2
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 6
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s5
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vmul.vv v16, v8, v24
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v24, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v0, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 8
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a4, sp, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 6
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 6
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 7
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 7
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 8
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 6
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v8, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vxor.vv v16, v16, v0
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v16, v8
+; RV64V-NEXT:    addi a4, sp, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 6
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 7
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v0, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v0, v0, v8
+; RV64V-NEXT:    vsll.vi v16, v16, 24
+; RV64V-NEXT:    vand.vx v8, v24, a0
+; RV64V-NEXT:    vsll.vi v8, v8, 8
+; RV64V-NEXT:    vor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v0, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 8
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vsrl.vi v24, v24, 8
+; RV64V-NEXT:    vand.vx v24, v24, a0
+; RV64V-NEXT:    vsrl.vi v8, v8, 24
+; RV64V-NEXT:    vor.vv v8, v24, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, a3
+; RV64V-NEXT:    vand.vx v16, v16, a3
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a2
+; RV64V-NEXT:    vand.vx v16, v16, a2
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v8, v8, 1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add sp, sp, a0
+; RV64V-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s0, 128(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s1, 120(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s2, 112(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s3, 104(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s4, 96(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s5, 88(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s6, 80(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s7, 72(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s8, 64(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s9, 56(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s10, 48(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s11, 40(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    addi sp, sp, 144
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC64-LABEL: clmulh_nxv16i32_vv:
+; RV32ZVBC64:       # %bb.0:
+; RV32ZVBC64-NEXT:    addi sp, sp, -80
+; RV32ZVBC64-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s1, 68(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s2, 64(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s3, 60(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s4, 56(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s5, 52(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s6, 48(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s7, 44(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s8, 40(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s9, 36(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s10, 32(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    sw s11, 28(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a0, vlenb
+; RV32ZVBC64-NEXT:    slli a0, a0, 5
+; RV32ZVBC64-NEXT:    mv a1, a0
+; RV32ZVBC64-NEXT:    slli a0, a0, 3
+; RV32ZVBC64-NEXT:    add a0, a0, a1
+; RV32ZVBC64-NEXT:    sub sp, sp, a0
+; RV32ZVBC64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32ZVBC64-NEXT:    vsrl.vi v24, v8, 8
+; RV32ZVBC64-NEXT:    lui a0, 16
+; RV32ZVBC64-NEXT:    vsrl.vi v0, v8, 24
+; RV32ZVBC64-NEXT:    addi a0, a0, -256
+; RV32ZVBC64-NEXT:    vand.vx v24, v24, a0
+; RV32ZVBC64-NEXT:    vor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    csrr a1, vlenb
+; RV32ZVBC64-NEXT:    slli a1, a1, 3
+; RV32ZVBC64-NEXT:    mv a2, a1
+; RV32ZVBC64-NEXT:    slli a1, a1, 1
+; RV32ZVBC64-NEXT:    add a2, a2, a1
+; RV32ZVBC64-NEXT:    slli a1, a1, 4
+; RV32ZVBC64-NEXT:    add a1, a1, a2
+; RV32ZVBC64-NEXT:    add a1, sp, a1
+; RV32ZVBC64-NEXT:    addi a1, a1, 16
+; RV32ZVBC64-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vsrl.vi v0, v16, 8
+; RV32ZVBC64-NEXT:    vand.vx v0, v0, a0
+; RV32ZVBC64-NEXT:    vsrl.vi v24, v16, 24
+; RV32ZVBC64-NEXT:    vor.vv v0, v0, v24
+; RV32ZVBC64-NEXT:    vsll.vi v24, v8, 24
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 8
+; RV32ZVBC64-NEXT:    vor.vv v8, v24, v8
+; RV32ZVBC64-NEXT:    vsll.vi v24, v16, 24
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vsll.vi v16, v16, 8
+; RV32ZVBC64-NEXT:    vor.vv v16, v24, v16
+; RV32ZVBC64-NEXT:    lui a1, 61681
+; RV32ZVBC64-NEXT:    lui a2, 209715
+; RV32ZVBC64-NEXT:    lui t6, 349525
+; RV32ZVBC64-NEXT:    li s5, 16
+; RV32ZVBC64-NEXT:    li t2, 32
+; RV32ZVBC64-NEXT:    li a7, 256
+; RV32ZVBC64-NEXT:    li t0, 512
+; RV32ZVBC64-NEXT:    li t1, 1024
+; RV32ZVBC64-NEXT:    li s6, 1
+; RV32ZVBC64-NEXT:    lui t3, 1
+; RV32ZVBC64-NEXT:    lui t4, 2
+; RV32ZVBC64-NEXT:    lui t5, 4
+; RV32ZVBC64-NEXT:    lui s0, 8
+; RV32ZVBC64-NEXT:    lui s1, 32
+; RV32ZVBC64-NEXT:    lui s2, 64
+; RV32ZVBC64-NEXT:    lui s3, 128
+; RV32ZVBC64-NEXT:    lui s4, 256
+; RV32ZVBC64-NEXT:    lui s7, 512
+; RV32ZVBC64-NEXT:    lui s8, 1024
+; RV32ZVBC64-NEXT:    lui s9, 2048
+; RV32ZVBC64-NEXT:    lui s10, 4096
+; RV32ZVBC64-NEXT:    lui s11, 8192
+; RV32ZVBC64-NEXT:    lui ra, 16384
+; RV32ZVBC64-NEXT:    lui a5, 32768
+; RV32ZVBC64-NEXT:    addi a3, a1, -241
+; RV32ZVBC64-NEXT:    addi a2, a2, 819
+; RV32ZVBC64-NEXT:    addi a1, t6, 1365
+; RV32ZVBC64-NEXT:    vor.vv v16, v16, v0
+; RV32ZVBC64-NEXT:    vsrl.vi v24, v16, 4
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a3
+; RV32ZVBC64-NEXT:    vand.vx v24, v24, a3
+; RV32ZVBC64-NEXT:    vsll.vi v16, v16, 4
+; RV32ZVBC64-NEXT:    vor.vv v16, v24, v16
+; RV32ZVBC64-NEXT:    vsrl.vi v24, v16, 2
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV32ZVBC64-NEXT:    vand.vx v24, v24, a2
+; RV32ZVBC64-NEXT:    vsll.vi v16, v16, 2
+; RV32ZVBC64-NEXT:    vor.vv v16, v24, v16
+; RV32ZVBC64-NEXT:    vsrl.vi v24, v16, 1
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV32ZVBC64-NEXT:    vand.vx v24, v24, a1
+; RV32ZVBC64-NEXT:    vadd.vv v16, v16, v16
+; RV32ZVBC64-NEXT:    vor.vv v0, v24, v16
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, s5
+; RV32ZVBC64-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a0, vlenb
+; RV32ZVBC64-NEXT:    slli a0, a0, 4
+; RV32ZVBC64-NEXT:    mv a4, a0
+; RV32ZVBC64-NEXT:    slli a0, a0, 4
+; RV32ZVBC64-NEXT:    add a0, a0, a4
+; RV32ZVBC64-NEXT:    add a0, sp, a0
+; RV32ZVBC64-NEXT:    addi a0, a0, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    lui t6, 65536
+; RV32ZVBC64-NEXT:    csrr a0, vlenb
+; RV32ZVBC64-NEXT:    slli a0, a0, 3
+; RV32ZVBC64-NEXT:    mv a4, a0
+; RV32ZVBC64-NEXT:    slli a0, a0, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a0
+; RV32ZVBC64-NEXT:    slli a0, a0, 4
+; RV32ZVBC64-NEXT:    add a0, a0, a4
+; RV32ZVBC64-NEXT:    add a0, sp, a0
+; RV32ZVBC64-NEXT:    addi a0, a0, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vor.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a3
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a3
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV32ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t2
+; RV32ZVBC64-NEXT:    csrr a0, vlenb
+; RV32ZVBC64-NEXT:    slli a0, a0, 3
+; RV32ZVBC64-NEXT:    mv a4, a0
+; RV32ZVBC64-NEXT:    slli a0, a0, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a0
+; RV32ZVBC64-NEXT:    slli a0, a0, 4
+; RV32ZVBC64-NEXT:    add a0, a0, a4
+; RV32ZVBC64-NEXT:    add a0, sp, a0
+; RV32ZVBC64-NEXT:    addi a0, a0, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    lui a4, 131072
+; RV32ZVBC64-NEXT:    slli s6, s6, 11
+; RV32ZVBC64-NEXT:    lui t2, 262144
+; RV32ZVBC64-NEXT:    lui s5, 524288
+; RV32ZVBC64-NEXT:    li a6, 64
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a6
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 3
+; RV32ZVBC64-NEXT:    mv a0, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 5
+; RV32ZVBC64-NEXT:    add a6, a6, a0
+; RV32ZVBC64-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    li a6, 128
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a6
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 8
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a7
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 3
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t0
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 4
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t1
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 3
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 2
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, s6
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 5
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t3
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 3
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 2
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t4
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 4
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 2
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t5
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 3
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 3
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, s0
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 6
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    lui a6, 16
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a6
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 3
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 2
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, s1
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 4
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 2
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, s2
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 3
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 2
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 2
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, s3
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 5
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 2
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, s4
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 3
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 3
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, s7
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 4
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 3
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, s8
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 3
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 4
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, s9
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, s10
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 3
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, s11
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 4
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, ra
+; RV32ZVBC64-NEXT:    csrr a6, vlenb
+; RV32ZVBC64-NEXT:    slli a6, a6, 3
+; RV32ZVBC64-NEXT:    mv a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 2
+; RV32ZVBC64-NEXT:    add a7, a7, a6
+; RV32ZVBC64-NEXT:    slli a6, a6, 1
+; RV32ZVBC64-NEXT:    add a6, a6, a7
+; RV32ZVBC64-NEXT:    add a6, sp, a6
+; RV32ZVBC64-NEXT:    addi a6, a6, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a5
+; RV32ZVBC64-NEXT:    csrr a5, vlenb
+; RV32ZVBC64-NEXT:    slli a5, a5, 5
+; RV32ZVBC64-NEXT:    mv a6, a5
+; RV32ZVBC64-NEXT:    slli a5, a5, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a6
+; RV32ZVBC64-NEXT:    add a5, sp, a5
+; RV32ZVBC64-NEXT:    addi a5, a5, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t6
+; RV32ZVBC64-NEXT:    csrr a5, vlenb
+; RV32ZVBC64-NEXT:    slli a5, a5, 3
+; RV32ZVBC64-NEXT:    mv a6, a5
+; RV32ZVBC64-NEXT:    slli a5, a5, 1
+; RV32ZVBC64-NEXT:    add a6, a6, a5
+; RV32ZVBC64-NEXT:    slli a5, a5, 2
+; RV32ZVBC64-NEXT:    add a5, a5, a6
+; RV32ZVBC64-NEXT:    add a5, sp, a5
+; RV32ZVBC64-NEXT:    addi a5, a5, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, a4
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vi v16, v0, 2
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vi v24, v0, 1
+; RV32ZVBC64-NEXT:    vand.vi v16, v0, 4
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vi v16, v0, 8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, t2
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 6
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vand.vx v16, v0, s5
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v24
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v24, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v0, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 8
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    addi a4, sp, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 6
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 6
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 7
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 7
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 8
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 6
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v16, v8
+; RV32ZVBC64-NEXT:    addi a4, sp, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 6
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 2
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 7
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v0, v24, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v0, v0, v8
+; RV32ZVBC64-NEXT:    vsll.vi v16, v16, 24
+; RV32ZVBC64-NEXT:    vand.vx v8, v24, a0
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 8
+; RV32ZVBC64-NEXT:    vor.vv v16, v16, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v8, v0, v8
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 5
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 8
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 4
+; RV32ZVBC64-NEXT:    mv a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a5, a5, a4
+; RV32ZVBC64-NEXT:    slli a4, a4, 1
+; RV32ZVBC64-NEXT:    add a4, a4, a5
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV32ZVBC64-NEXT:    vsrl.vi v24, v24, 8
+; RV32ZVBC64-NEXT:    vand.vx v24, v24, a0
+; RV32ZVBC64-NEXT:    vsrl.vi v8, v8, 24
+; RV32ZVBC64-NEXT:    vor.vv v8, v24, v8
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a3
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a3
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV32ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v8, v8, 1
+; RV32ZVBC64-NEXT:    csrr a0, vlenb
+; RV32ZVBC64-NEXT:    slli a0, a0, 5
+; RV32ZVBC64-NEXT:    mv a1, a0
+; RV32ZVBC64-NEXT:    slli a0, a0, 3
+; RV32ZVBC64-NEXT:    add a0, a0, a1
+; RV32ZVBC64-NEXT:    add sp, sp, a0
+; RV32ZVBC64-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s2, 64(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s3, 60(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s4, 56(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s5, 52(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s6, 48(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s7, 44(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s8, 40(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s9, 36(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s10, 32(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    lw s11, 28(sp) # 4-byte Folded Reload
+; RV32ZVBC64-NEXT:    addi sp, sp, 80
+; RV32ZVBC64-NEXT:    ret
+;
+; RV64ZVBC64-LABEL: clmulh_nxv16i32_vv:
+; RV64ZVBC64:       # %bb.0:
+; RV64ZVBC64-NEXT:    addi sp, sp, -144
+; RV64ZVBC64-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s0, 128(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s1, 120(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s2, 112(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s3, 104(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s4, 96(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s5, 88(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s6, 80(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s7, 72(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s8, 64(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s9, 56(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s10, 48(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s11, 40(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a0, vlenb
+; RV64ZVBC64-NEXT:    slli a0, a0, 5
+; RV64ZVBC64-NEXT:    mv a1, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 3
+; RV64ZVBC64-NEXT:    add a0, a0, a1
+; RV64ZVBC64-NEXT:    sub sp, sp, a0
+; RV64ZVBC64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV64ZVBC64-NEXT:    vsrl.vi v24, v8, 8
+; RV64ZVBC64-NEXT:    lui a0, 16
+; RV64ZVBC64-NEXT:    vsrl.vi v0, v8, 24
+; RV64ZVBC64-NEXT:    addi a0, a0, -256
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a0
+; RV64ZVBC64-NEXT:    vor.vv v24, v24, v0
+; RV64ZVBC64-NEXT:    csrr a1, vlenb
+; RV64ZVBC64-NEXT:    slli a1, a1, 3
+; RV64ZVBC64-NEXT:    mv a2, a1
+; RV64ZVBC64-NEXT:    slli a1, a1, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a1
+; RV64ZVBC64-NEXT:    slli a1, a1, 4
+; RV64ZVBC64-NEXT:    add a1, a1, a2
+; RV64ZVBC64-NEXT:    add a1, sp, a1
+; RV64ZVBC64-NEXT:    addi a1, a1, 32
+; RV64ZVBC64-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vsrl.vi v0, v16, 8
+; RV64ZVBC64-NEXT:    vand.vx v0, v0, a0
+; RV64ZVBC64-NEXT:    vsrl.vi v24, v16, 24
+; RV64ZVBC64-NEXT:    vor.vv v0, v0, v24
+; RV64ZVBC64-NEXT:    vsll.vi v24, v8, 24
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 8
+; RV64ZVBC64-NEXT:    vor.vv v8, v24, v8
+; RV64ZVBC64-NEXT:    vsll.vi v24, v16, 24
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vsll.vi v16, v16, 8
+; RV64ZVBC64-NEXT:    vor.vv v16, v24, v16
+; RV64ZVBC64-NEXT:    lui a1, 61681
+; RV64ZVBC64-NEXT:    lui a2, 209715
+; RV64ZVBC64-NEXT:    lui t6, 349525
+; RV64ZVBC64-NEXT:    li s5, 16
+; RV64ZVBC64-NEXT:    li t2, 32
+; RV64ZVBC64-NEXT:    li a7, 256
+; RV64ZVBC64-NEXT:    li t0, 512
+; RV64ZVBC64-NEXT:    li t1, 1024
+; RV64ZVBC64-NEXT:    li s6, 1
+; RV64ZVBC64-NEXT:    lui t3, 1
+; RV64ZVBC64-NEXT:    lui t4, 2
+; RV64ZVBC64-NEXT:    lui t5, 4
+; RV64ZVBC64-NEXT:    lui s0, 8
+; RV64ZVBC64-NEXT:    lui s1, 32
+; RV64ZVBC64-NEXT:    lui s2, 64
+; RV64ZVBC64-NEXT:    lui s3, 128
+; RV64ZVBC64-NEXT:    lui s4, 256
+; RV64ZVBC64-NEXT:    lui s7, 512
+; RV64ZVBC64-NEXT:    lui s8, 1024
+; RV64ZVBC64-NEXT:    lui s9, 2048
+; RV64ZVBC64-NEXT:    lui s10, 4096
+; RV64ZVBC64-NEXT:    lui s11, 8192
+; RV64ZVBC64-NEXT:    lui ra, 16384
+; RV64ZVBC64-NEXT:    lui a5, 32768
+; RV64ZVBC64-NEXT:    addi a3, a1, -241
+; RV64ZVBC64-NEXT:    addi a2, a2, 819
+; RV64ZVBC64-NEXT:    addi a1, t6, 1365
+; RV64ZVBC64-NEXT:    vor.vv v16, v16, v0
+; RV64ZVBC64-NEXT:    vsrl.vi v24, v16, 4
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a3
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a3
+; RV64ZVBC64-NEXT:    vsll.vi v16, v16, 4
+; RV64ZVBC64-NEXT:    vor.vv v16, v24, v16
+; RV64ZVBC64-NEXT:    vsrl.vi v24, v16, 2
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a2
+; RV64ZVBC64-NEXT:    vsll.vi v16, v16, 2
+; RV64ZVBC64-NEXT:    vor.vv v16, v24, v16
+; RV64ZVBC64-NEXT:    vsrl.vi v24, v16, 1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a1
+; RV64ZVBC64-NEXT:    vadd.vv v16, v16, v16
+; RV64ZVBC64-NEXT:    vor.vv v0, v24, v16
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s5
+; RV64ZVBC64-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a0, vlenb
+; RV64ZVBC64-NEXT:    slli a0, a0, 4
+; RV64ZVBC64-NEXT:    mv a4, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 4
+; RV64ZVBC64-NEXT:    add a0, a0, a4
+; RV64ZVBC64-NEXT:    add a0, sp, a0
+; RV64ZVBC64-NEXT:    addi a0, a0, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    lui t6, 65536
+; RV64ZVBC64-NEXT:    csrr a0, vlenb
+; RV64ZVBC64-NEXT:    slli a0, a0, 3
+; RV64ZVBC64-NEXT:    mv a4, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 4
+; RV64ZVBC64-NEXT:    add a0, a0, a4
+; RV64ZVBC64-NEXT:    add a0, sp, a0
+; RV64ZVBC64-NEXT:    addi a0, a0, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vor.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a3
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a3
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t2
+; RV64ZVBC64-NEXT:    csrr a0, vlenb
+; RV64ZVBC64-NEXT:    slli a0, a0, 3
+; RV64ZVBC64-NEXT:    mv a4, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 4
+; RV64ZVBC64-NEXT:    add a0, a0, a4
+; RV64ZVBC64-NEXT:    add a0, sp, a0
+; RV64ZVBC64-NEXT:    addi a0, a0, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    lui a4, 131072
+; RV64ZVBC64-NEXT:    slli s6, s6, 11
+; RV64ZVBC64-NEXT:    lui t2, 262144
+; RV64ZVBC64-NEXT:    lui s5, 524288
+; RV64ZVBC64-NEXT:    li a6, 64
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a6
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a0, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 5
+; RV64ZVBC64-NEXT:    add a6, a6, a0
+; RV64ZVBC64-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    li a6, 128
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a6
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 8
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a7
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t0
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 4
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t1
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s6
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 5
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t3
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t4
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 4
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t5
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s0
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 6
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    lui a6, 16
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a6
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s1
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 4
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s2
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s3
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 5
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s4
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s7
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 4
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s8
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 4
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s9
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s10
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s11
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 4
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, ra
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a5
+; RV64ZVBC64-NEXT:    csrr a5, vlenb
+; RV64ZVBC64-NEXT:    slli a5, a5, 5
+; RV64ZVBC64-NEXT:    mv a6, a5
+; RV64ZVBC64-NEXT:    slli a5, a5, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a6
+; RV64ZVBC64-NEXT:    add a5, sp, a5
+; RV64ZVBC64-NEXT:    addi a5, a5, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t6
+; RV64ZVBC64-NEXT:    csrr a5, vlenb
+; RV64ZVBC64-NEXT:    slli a5, a5, 3
+; RV64ZVBC64-NEXT:    mv a6, a5
+; RV64ZVBC64-NEXT:    slli a5, a5, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a5
+; RV64ZVBC64-NEXT:    slli a5, a5, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a6
+; RV64ZVBC64-NEXT:    add a5, sp, a5
+; RV64ZVBC64-NEXT:    addi a5, a5, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a4
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 2
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v24, v0, 1
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 4
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t2
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 6
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s5
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v24
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v24, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 8
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    addi a4, sp, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 6
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 6
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 7
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 7
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 8
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 6
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v16, v8
+; RV64ZVBC64-NEXT:    addi a4, sp, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 6
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 7
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v0, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v0, v0, v8
+; RV64ZVBC64-NEXT:    vsll.vi v16, v16, 24
+; RV64ZVBC64-NEXT:    vand.vx v8, v24, a0
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 8
+; RV64ZVBC64-NEXT:    vor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v8, v0, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 8
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vsrl.vi v24, v24, 8
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a0
+; RV64ZVBC64-NEXT:    vsrl.vi v8, v8, 24
+; RV64ZVBC64-NEXT:    vor.vv v8, v24, v8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a3
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a3
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v8, v8, 1
+; RV64ZVBC64-NEXT:    csrr a0, vlenb
+; RV64ZVBC64-NEXT:    slli a0, a0, 5
+; RV64ZVBC64-NEXT:    mv a1, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 3
+; RV64ZVBC64-NEXT:    add a0, a0, a1
+; RV64ZVBC64-NEXT:    add sp, sp, a0
+; RV64ZVBC64-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s0, 128(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s1, 120(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s2, 112(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s3, 104(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s4, 96(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s5, 88(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s6, 80(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s7, 72(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s8, 64(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s9, 56(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s10, 48(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s11, 40(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    addi sp, sp, 144
+; RV64ZVBC64-NEXT:    ret
+;
+; RV32ZVBC32-LABEL: clmulh_nxv16i32_vv:
+; RV32ZVBC32:       # %bb.0:
+; RV32ZVBC32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vv v8, v8, v16
+; RV32ZVBC32-NEXT:    ret
+;
+; RV64ZVBC32-LABEL: clmulh_nxv16i32_vv:
+; RV64ZVBC32:       # %bb.0:
+; RV64ZVBC32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vv v8, v8, v16
+; RV64ZVBC32-NEXT:    ret
+  %va.ext = zext <vscale x 16 x i32> %va to <vscale x 16 x i64>
+  %vb.ext = zext <vscale x 16 x i32> %vb to <vscale x 16 x i64>
+  %clmul = call <vscale x 16 x i64> @llvm.clmul.nxv16i64(<vscale x 16 x i64> %va.ext, <vscale x 16 x i64> %vb.ext)
+  %res.ext = lshr <vscale x 16 x i64> %clmul, splat(i64 32)
+  %res = trunc <vscale x 16 x i64> %res.ext to <vscale x 16 x i32>
+  ret <vscale x 16 x i32> %res
+}
+
+define <vscale x 16 x i32> @clmulh_nxv16i32_vx(<vscale x 16 x i32> %va, i32 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv16i32_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -16
+; RV32V-NEXT:    csrr a1, vlenb
+; RV32V-NEXT:    slli a1, a1, 4
+; RV32V-NEXT:    sub sp, sp, a1
+; RV32V-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32V-NEXT:    vsrl.vi v16, v8, 8
+; RV32V-NEXT:    lui a4, 16
+; RV32V-NEXT:    vsrl.vi v24, v8, 24
+; RV32V-NEXT:    vsll.vi v0, v8, 24
+; RV32V-NEXT:    lui a2, 61681
+; RV32V-NEXT:    lui a5, 209715
+; RV32V-NEXT:    lui a7, 349525
+; RV32V-NEXT:    srli a3, a0, 8
+; RV32V-NEXT:    srli a6, a0, 24
+; RV32V-NEXT:    addi a1, a4, -256
+; RV32V-NEXT:    and a3, a3, a1
+; RV32V-NEXT:    or t0, a3, a6
+; RV32V-NEXT:    slli a3, a0, 24
+; RV32V-NEXT:    and a0, a0, a1
+; RV32V-NEXT:    slli a0, a0, 8
+; RV32V-NEXT:    or t1, a3, a0
+; RV32V-NEXT:    li a6, 1
+; RV32V-NEXT:    addi a3, a2, -241
+; RV32V-NEXT:    addi a2, a5, 819
+; RV32V-NEXT:    addi a0, a7, 1365
+; RV32V-NEXT:    vand.vx v16, v16, a1
+; RV32V-NEXT:    vand.vx v8, v8, a1
+; RV32V-NEXT:    vor.vv v16, v16, v24
+; RV32V-NEXT:    vsll.vi v8, v8, 8
+; RV32V-NEXT:    vor.vv v8, v0, v8
+; RV32V-NEXT:    vor.vv v8, v8, v16
+; RV32V-NEXT:    or a5, t1, t0
+; RV32V-NEXT:    vsrl.vi v16, v8, 4
+; RV32V-NEXT:    vand.vx v8, v8, a3
+; RV32V-NEXT:    srli a7, a5, 4
+; RV32V-NEXT:    and a5, a5, a3
+; RV32V-NEXT:    vand.vx v16, v16, a3
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    and a7, a7, a3
+; RV32V-NEXT:    slli a5, a5, 4
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    or a5, a7, a5
+; RV32V-NEXT:    vsrl.vi v16, v8, 2
+; RV32V-NEXT:    vand.vx v8, v8, a2
+; RV32V-NEXT:    srli a7, a5, 2
+; RV32V-NEXT:    and a5, a5, a2
+; RV32V-NEXT:    vand.vx v16, v16, a2
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    and a7, a7, a2
+; RV32V-NEXT:    slli a5, a5, 2
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    or a5, a7, a5
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    srli a7, a5, 1
+; RV32V-NEXT:    and a5, a5, a0
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    and a7, a7, a0
+; RV32V-NEXT:    slli a5, a5, 1
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    or a5, a7, a5
+; RV32V-NEXT:    andi a7, a5, 2
+; RV32V-NEXT:    vmul.vx v16, v8, a7
+; RV32V-NEXT:    andi a7, a5, 1
+; RV32V-NEXT:    vmul.vx v24, v8, a7
+; RV32V-NEXT:    andi a7, a5, 4
+; RV32V-NEXT:    vmul.vx v0, v8, a7
+; RV32V-NEXT:    andi a7, a5, 8
+; RV32V-NEXT:    vxor.vv v16, v24, v16
+; RV32V-NEXT:    vmul.vx v24, v8, a7
+; RV32V-NEXT:    andi a7, a5, 16
+; RV32V-NEXT:    vxor.vv v16, v16, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a7
+; RV32V-NEXT:    andi a7, a5, 32
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, a7
+; RV32V-NEXT:    andi a7, a5, 64
+; RV32V-NEXT:    vxor.vv v16, v16, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a7
+; RV32V-NEXT:    andi a7, a5, 128
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vmul.vx v24, v8, a7
+; RV32V-NEXT:    andi a7, a5, 256
+; RV32V-NEXT:    vxor.vv v16, v16, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a7
+; RV32V-NEXT:    andi a7, a5, 512
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    csrr t0, vlenb
+; RV32V-NEXT:    slli t0, t0, 3
+; RV32V-NEXT:    add t0, sp, t0
+; RV32V-NEXT:    addi t0, t0, 16
+; RV32V-NEXT:    vs8r.v v16, (t0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vmul.vx v24, v8, a7
+; RV32V-NEXT:    andi a7, a5, 1024
+; RV32V-NEXT:    vxor.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v24, v0, v24
+; RV32V-NEXT:    vmul.vx v0, v8, a7
+; RV32V-NEXT:    lui a7, 1
+; RV32V-NEXT:    slli a6, a6, 11
+; RV32V-NEXT:    and a6, a5, a6
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a6
+; RV32V-NEXT:    lui a6, 2
+; RV32V-NEXT:    and a7, a5, a7
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a7
+; RV32V-NEXT:    lui a7, 4
+; RV32V-NEXT:    and a6, a5, a6
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a6
+; RV32V-NEXT:    lui a6, 8
+; RV32V-NEXT:    and a7, a5, a7
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a7
+; RV32V-NEXT:    lui a7, 32
+; RV32V-NEXT:    and a6, a5, a6
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a6
+; RV32V-NEXT:    lui a6, 64
+; RV32V-NEXT:    and a4, a5, a4
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a4
+; RV32V-NEXT:    lui a4, 128
+; RV32V-NEXT:    and a7, a5, a7
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a7
+; RV32V-NEXT:    lui a7, 256
+; RV32V-NEXT:    and a6, a5, a6
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a6
+; RV32V-NEXT:    lui a6, 512
+; RV32V-NEXT:    and a4, a5, a4
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a4
+; RV32V-NEXT:    lui a4, 1024
+; RV32V-NEXT:    and a7, a5, a7
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a7
+; RV32V-NEXT:    lui a7, 2048
+; RV32V-NEXT:    and a6, a5, a6
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a6
+; RV32V-NEXT:    lui a6, 4096
+; RV32V-NEXT:    and a4, a5, a4
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a4
+; RV32V-NEXT:    lui a4, 8192
+; RV32V-NEXT:    and a7, a5, a7
+; RV32V-NEXT:    and a6, a5, a6
+; RV32V-NEXT:    and a4, a5, a4
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a7
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vmul.vx v0, v8, a6
+; RV32V-NEXT:    vxor.vv v0, v24, v0
+; RV32V-NEXT:    vmul.vx v16, v8, a4
+; RV32V-NEXT:    vxor.vv v16, v0, v16
+; RV32V-NEXT:    addi a4, sp, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vsll.vi v16, v16, 24
+; RV32V-NEXT:    vand.vx v0, v24, a1
+; RV32V-NEXT:    vsll.vi v0, v0, 8
+; RV32V-NEXT:    vor.vv v16, v16, v0
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 16
+; RV32V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a4, 16384
+; RV32V-NEXT:    and a4, a5, a4
+; RV32V-NEXT:    vmul.vx v0, v8, a4
+; RV32V-NEXT:    addi a4, sp, 16
+; RV32V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v0, v16, v0
+; RV32V-NEXT:    lui a4, 32768
+; RV32V-NEXT:    and a4, a5, a4
+; RV32V-NEXT:    vmul.vx v16, v8, a4
+; RV32V-NEXT:    vxor.vv v16, v0, v16
+; RV32V-NEXT:    lui a4, 65536
+; RV32V-NEXT:    and a4, a5, a4
+; RV32V-NEXT:    vmul.vx v0, v8, a4
+; RV32V-NEXT:    vxor.vv v16, v16, v0
+; RV32V-NEXT:    lui a4, 131072
+; RV32V-NEXT:    and a4, a5, a4
+; RV32V-NEXT:    vmul.vx v0, v8, a4
+; RV32V-NEXT:    vxor.vv v16, v16, v0
+; RV32V-NEXT:    lui a4, 262144
+; RV32V-NEXT:    and a4, a5, a4
+; RV32V-NEXT:    vmul.vx v0, v8, a4
+; RV32V-NEXT:    vxor.vv v16, v16, v0
+; RV32V-NEXT:    lui a4, 524288
+; RV32V-NEXT:    and a4, a5, a4
+; RV32V-NEXT:    vmul.vx v8, v8, a4
+; RV32V-NEXT:    vxor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v24, 8
+; RV32V-NEXT:    vand.vx v16, v16, a1
+; RV32V-NEXT:    vsrl.vi v8, v8, 24
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    csrr a1, vlenb
+; RV32V-NEXT:    slli a1, a1, 3
+; RV32V-NEXT:    add a1, sp, a1
+; RV32V-NEXT:    addi a1, a1, 16
+; RV32V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 4
+; RV32V-NEXT:    vand.vx v8, v8, a3
+; RV32V-NEXT:    vand.vx v16, v16, a3
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 2
+; RV32V-NEXT:    vand.vx v8, v8, a2
+; RV32V-NEXT:    vand.vx v16, v16, a2
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v8, v8, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add sp, sp, a0
+; RV32V-NEXT:    addi sp, sp, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv16i32_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    addi sp, sp, -144
+; RV64V-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s0, 128(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s1, 120(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s2, 112(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s3, 104(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s4, 96(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s5, 88(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s6, 80(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s7, 72(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s8, 64(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s9, 56(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s10, 48(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s11, 40(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 5
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    sub sp, sp, a1
+; RV64V-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV64V-NEXT:    vmv.v.x v0, a0
+; RV64V-NEXT:    vsrl.vi v16, v8, 8
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vsrl.vi v24, v8, 24
+; RV64V-NEXT:    addi a0, a0, -256
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vor.vv v16, v16, v24
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 4
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 32
+; RV64V-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vsrl.vi v24, v0, 8
+; RV64V-NEXT:    vand.vx v24, v24, a0
+; RV64V-NEXT:    vsrl.vi v16, v0, 24
+; RV64V-NEXT:    vor.vv v24, v24, v16
+; RV64V-NEXT:    vsll.vi v16, v8, 24
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vsll.vi v8, v8, 8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsll.vi v16, v0, 24
+; RV64V-NEXT:    vand.vx v0, v0, a0
+; RV64V-NEXT:    vsll.vi v0, v0, 8
+; RV64V-NEXT:    vor.vv v0, v16, v0
+; RV64V-NEXT:    lui a1, 61681
+; RV64V-NEXT:    lui a2, 209715
+; RV64V-NEXT:    lui t6, 349525
+; RV64V-NEXT:    li s5, 16
+; RV64V-NEXT:    li t2, 32
+; RV64V-NEXT:    li a7, 256
+; RV64V-NEXT:    li t0, 512
+; RV64V-NEXT:    li t1, 1024
+; RV64V-NEXT:    li s6, 1
+; RV64V-NEXT:    lui t3, 1
+; RV64V-NEXT:    lui t4, 2
+; RV64V-NEXT:    lui t5, 4
+; RV64V-NEXT:    lui s0, 8
+; RV64V-NEXT:    lui s1, 32
+; RV64V-NEXT:    lui s2, 64
+; RV64V-NEXT:    lui s3, 128
+; RV64V-NEXT:    lui s4, 256
+; RV64V-NEXT:    lui s7, 512
+; RV64V-NEXT:    lui s8, 1024
+; RV64V-NEXT:    lui s9, 2048
+; RV64V-NEXT:    lui s10, 4096
+; RV64V-NEXT:    lui s11, 8192
+; RV64V-NEXT:    lui ra, 16384
+; RV64V-NEXT:    lui a5, 32768
+; RV64V-NEXT:    addi a3, a1, -241
+; RV64V-NEXT:    addi a2, a2, 819
+; RV64V-NEXT:    addi a1, t6, 1365
+; RV64V-NEXT:    vor.vv v16, v0, v24
+; RV64V-NEXT:    vsrl.vi v24, v16, 4
+; RV64V-NEXT:    vand.vx v16, v16, a3
+; RV64V-NEXT:    vand.vx v24, v24, a3
+; RV64V-NEXT:    vsll.vi v16, v16, 4
+; RV64V-NEXT:    vor.vv v16, v24, v16
+; RV64V-NEXT:    vsrl.vi v24, v16, 2
+; RV64V-NEXT:    vand.vx v16, v16, a2
+; RV64V-NEXT:    vand.vx v24, v24, a2
+; RV64V-NEXT:    vsll.vi v16, v16, 2
+; RV64V-NEXT:    vor.vv v16, v24, v16
+; RV64V-NEXT:    vsrl.vi v24, v16, 1
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vand.vx v24, v24, a1
+; RV64V-NEXT:    vadd.vv v16, v16, v16
+; RV64V-NEXT:    vor.vv v0, v24, v16
+; RV64V-NEXT:    vand.vx v16, v0, s5
+; RV64V-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a4, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a4
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 32
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    lui t6, 65536
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a4, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a4, a4, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a4
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 32
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vor.vv v8, v8, v16
+; RV64V-NEXT:    vsrl.vi v16, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, a3
+; RV64V-NEXT:    vand.vx v16, v16, a3
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a2
+; RV64V-NEXT:    vand.vx v16, v16, a2
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vand.vx v16, v0, t2
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a4, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a4, a4, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a4
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 32
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    lui a4, 131072
+; RV64V-NEXT:    slli s6, s6, 11
+; RV64V-NEXT:    lui t2, 262144
+; RV64V-NEXT:    lui s5, 524288
+; RV64V-NEXT:    li a6, 64
+; RV64V-NEXT:    vand.vx v16, v0, a6
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a0, a6
+; RV64V-NEXT:    slli a6, a6, 5
+; RV64V-NEXT:    add a6, a6, a0
+; RV64V-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    li a6, 128
+; RV64V-NEXT:    vand.vx v16, v0, a6
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 8
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a7
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t0
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 4
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t1
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s6
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 5
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t3
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t4
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 4
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t5
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s0
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 6
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    lui a6, 16
+; RV64V-NEXT:    vand.vx v16, v0, a6
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s1
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 4
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s2
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s3
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 5
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s4
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s7
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 4
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s8
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 4
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s9
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s10
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s11
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 4
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, ra
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 3
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a7, a7, a6
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 32
+; RV64V-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a5
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 5
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 32
+; RV64V-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t6
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 2
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 32
+; RV64V-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a4
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v16, v0, 2
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v24, v0, 1
+; RV64V-NEXT:    vand.vi v16, v0, 4
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v16, v0, 8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t2
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 6
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s5
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vmul.vv v16, v8, v24
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v24, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v0, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 8
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a4, sp, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 6
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 6
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 7
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 7
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 8
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 6
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v8, v8, v16
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vxor.vv v16, v16, v0
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v16, v8
+; RV64V-NEXT:    addi a4, sp, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 6
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 7
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v0, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v0, v0, v8
+; RV64V-NEXT:    vsll.vi v16, v16, 24
+; RV64V-NEXT:    vand.vx v8, v24, a0
+; RV64V-NEXT:    vsll.vi v8, v8, 8
+; RV64V-NEXT:    vor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v0, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 5
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 8
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 32
+; RV64V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vsrl.vi v24, v24, 8
+; RV64V-NEXT:    vand.vx v24, v24, a0
+; RV64V-NEXT:    vsrl.vi v8, v8, 24
+; RV64V-NEXT:    vor.vv v8, v24, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, a3
+; RV64V-NEXT:    vand.vx v16, v16, a3
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a2
+; RV64V-NEXT:    vand.vx v16, v16, a2
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
 ; RV64V-NEXT:    vand.vx v8, v8, a1
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vmul.vv v8, v16, v8
-; RV64V-NEXT:    vxor.vv v16, v24, v8
-; RV64V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64V-NEXT:    vnsrl.wx v8, v16, a0
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v8, v8, 1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add sp, sp, a0
+; RV64V-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s0, 128(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s1, 120(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s2, 112(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s3, 104(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s4, 96(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s5, 88(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s6, 80(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s7, 72(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s8, 64(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s9, 56(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s10, 48(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s11, 40(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    addi sp, sp, 144
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC64-LABEL: clmulh_nxv8i32_vx:
+; RV32ZVBC64-LABEL: clmulh_nxv16i32_vx:
 ; RV32ZVBC64:       # %bb.0:
-; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32ZVBC64-NEXT:    vmv.v.x v24, a0
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32ZVBC64-NEXT:    vzext.vf2 v16, v8
-; RV32ZVBC64-NEXT:    vzext.vf2 v8, v24
-; RV32ZVBC64-NEXT:    vclmul.vv v16, v16, v8
-; RV32ZVBC64-NEXT:    li a0, 32
-; RV32ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC64-NEXT:    vnsrl.wx v8, v16, a0
+; RV32ZVBC64-NEXT:    addi sp, sp, -16
+; RV32ZVBC64-NEXT:    csrr a1, vlenb
+; RV32ZVBC64-NEXT:    slli a1, a1, 4
+; RV32ZVBC64-NEXT:    sub sp, sp, a1
+; RV32ZVBC64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 8
+; RV32ZVBC64-NEXT:    lui a4, 16
+; RV32ZVBC64-NEXT:    vsrl.vi v24, v8, 24
+; RV32ZVBC64-NEXT:    vsll.vi v0, v8, 24
+; RV32ZVBC64-NEXT:    lui a2, 61681
+; RV32ZVBC64-NEXT:    lui a5, 209715
+; RV32ZVBC64-NEXT:    lui a7, 349525
+; RV32ZVBC64-NEXT:    srli a3, a0, 8
+; RV32ZVBC64-NEXT:    srli a6, a0, 24
+; RV32ZVBC64-NEXT:    addi a1, a4, -256
+; RV32ZVBC64-NEXT:    and a3, a3, a1
+; RV32ZVBC64-NEXT:    or t0, a3, a6
+; RV32ZVBC64-NEXT:    slli a3, a0, 24
+; RV32ZVBC64-NEXT:    and a0, a0, a1
+; RV32ZVBC64-NEXT:    slli a0, a0, 8
+; RV32ZVBC64-NEXT:    or t1, a3, a0
+; RV32ZVBC64-NEXT:    li a6, 1
+; RV32ZVBC64-NEXT:    addi a3, a2, -241
+; RV32ZVBC64-NEXT:    addi a2, a5, 819
+; RV32ZVBC64-NEXT:    addi a0, a7, 1365
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV32ZVBC64-NEXT:    vor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 8
+; RV32ZVBC64-NEXT:    vor.vv v8, v0, v8
+; RV32ZVBC64-NEXT:    vor.vv v8, v8, v16
+; RV32ZVBC64-NEXT:    or a5, t1, t0
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a3
+; RV32ZVBC64-NEXT:    srli a7, a5, 4
+; RV32ZVBC64-NEXT:    and a5, a5, a3
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a3
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC64-NEXT:    and a7, a7, a3
+; RV32ZVBC64-NEXT:    slli a5, a5, 4
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    or a5, a7, a5
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV32ZVBC64-NEXT:    srli a7, a5, 2
+; RV32ZVBC64-NEXT:    and a5, a5, a2
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC64-NEXT:    and a7, a7, a2
+; RV32ZVBC64-NEXT:    slli a5, a5, 2
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    or a5, a7, a5
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC64-NEXT:    srli a7, a5, 1
+; RV32ZVBC64-NEXT:    and a5, a5, a0
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC64-NEXT:    and a7, a7, a0
+; RV32ZVBC64-NEXT:    slli a5, a5, 1
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    or a5, a7, a5
+; RV32ZVBC64-NEXT:    andi a7, a5, 2
+; RV32ZVBC64-NEXT:    vmul.vx v16, v8, a7
+; RV32ZVBC64-NEXT:    andi a7, a5, 1
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, a7
+; RV32ZVBC64-NEXT:    andi a7, a5, 4
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a7
+; RV32ZVBC64-NEXT:    andi a7, a5, 8
+; RV32ZVBC64-NEXT:    vxor.vv v16, v24, v16
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, a7
+; RV32ZVBC64-NEXT:    andi a7, a5, 16
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a7
+; RV32ZVBC64-NEXT:    andi a7, a5, 32
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, a7
+; RV32ZVBC64-NEXT:    andi a7, a5, 64
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a7
+; RV32ZVBC64-NEXT:    andi a7, a5, 128
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, a7
+; RV32ZVBC64-NEXT:    andi a7, a5, 256
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a7
+; RV32ZVBC64-NEXT:    andi a7, a5, 512
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV32ZVBC64-NEXT:    csrr t0, vlenb
+; RV32ZVBC64-NEXT:    slli t0, t0, 3
+; RV32ZVBC64-NEXT:    add t0, sp, t0
+; RV32ZVBC64-NEXT:    addi t0, t0, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (t0) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    vmul.vx v24, v8, a7
+; RV32ZVBC64-NEXT:    andi a7, a5, 1024
+; RV32ZVBC64-NEXT:    vxor.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    vxor.vv v24, v0, v24
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a7
+; RV32ZVBC64-NEXT:    lui a7, 1
+; RV32ZVBC64-NEXT:    slli a6, a6, 11
+; RV32ZVBC64-NEXT:    and a6, a5, a6
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a6
+; RV32ZVBC64-NEXT:    lui a6, 2
+; RV32ZVBC64-NEXT:    and a7, a5, a7
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a7
+; RV32ZVBC64-NEXT:    lui a7, 4
+; RV32ZVBC64-NEXT:    and a6, a5, a6
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a6
+; RV32ZVBC64-NEXT:    lui a6, 8
+; RV32ZVBC64-NEXT:    and a7, a5, a7
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a7
+; RV32ZVBC64-NEXT:    lui a7, 32
+; RV32ZVBC64-NEXT:    and a6, a5, a6
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a6
+; RV32ZVBC64-NEXT:    lui a6, 64
+; RV32ZVBC64-NEXT:    and a4, a5, a4
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a4
+; RV32ZVBC64-NEXT:    lui a4, 128
+; RV32ZVBC64-NEXT:    and a7, a5, a7
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a7
+; RV32ZVBC64-NEXT:    lui a7, 256
+; RV32ZVBC64-NEXT:    and a6, a5, a6
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a6
+; RV32ZVBC64-NEXT:    lui a6, 512
+; RV32ZVBC64-NEXT:    and a4, a5, a4
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a4
+; RV32ZVBC64-NEXT:    lui a4, 1024
+; RV32ZVBC64-NEXT:    and a7, a5, a7
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a7
+; RV32ZVBC64-NEXT:    lui a7, 2048
+; RV32ZVBC64-NEXT:    and a6, a5, a6
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a6
+; RV32ZVBC64-NEXT:    lui a6, 4096
+; RV32ZVBC64-NEXT:    and a4, a5, a4
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a4
+; RV32ZVBC64-NEXT:    lui a4, 8192
+; RV32ZVBC64-NEXT:    and a7, a5, a7
+; RV32ZVBC64-NEXT:    and a6, a5, a6
+; RV32ZVBC64-NEXT:    and a4, a5, a4
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a7
+; RV32ZVBC64-NEXT:    vxor.vv v24, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a6
+; RV32ZVBC64-NEXT:    vxor.vv v0, v24, v0
+; RV32ZVBC64-NEXT:    vmul.vx v16, v8, a4
+; RV32ZVBC64-NEXT:    vxor.vv v16, v0, v16
+; RV32ZVBC64-NEXT:    addi a4, sp, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vsll.vi v16, v16, 24
+; RV32ZVBC64-NEXT:    vand.vx v0, v24, a1
+; RV32ZVBC64-NEXT:    vsll.vi v0, v0, 8
+; RV32ZVBC64-NEXT:    vor.vv v16, v16, v0
+; RV32ZVBC64-NEXT:    csrr a4, vlenb
+; RV32ZVBC64-NEXT:    slli a4, a4, 3
+; RV32ZVBC64-NEXT:    add a4, sp, a4
+; RV32ZVBC64-NEXT:    addi a4, a4, 16
+; RV32ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32ZVBC64-NEXT:    lui a4, 16384
+; RV32ZVBC64-NEXT:    and a4, a5, a4
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a4
+; RV32ZVBC64-NEXT:    addi a4, sp, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vxor.vv v0, v16, v0
+; RV32ZVBC64-NEXT:    lui a4, 32768
+; RV32ZVBC64-NEXT:    and a4, a5, a4
+; RV32ZVBC64-NEXT:    vmul.vx v16, v8, a4
+; RV32ZVBC64-NEXT:    vxor.vv v16, v0, v16
+; RV32ZVBC64-NEXT:    lui a4, 65536
+; RV32ZVBC64-NEXT:    and a4, a5, a4
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a4
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV32ZVBC64-NEXT:    lui a4, 131072
+; RV32ZVBC64-NEXT:    and a4, a5, a4
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a4
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV32ZVBC64-NEXT:    lui a4, 262144
+; RV32ZVBC64-NEXT:    and a4, a5, a4
+; RV32ZVBC64-NEXT:    vmul.vx v0, v8, a4
+; RV32ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV32ZVBC64-NEXT:    lui a4, 524288
+; RV32ZVBC64-NEXT:    and a4, a5, a4
+; RV32ZVBC64-NEXT:    vmul.vx v8, v8, a4
+; RV32ZVBC64-NEXT:    vxor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v24, 8
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV32ZVBC64-NEXT:    vsrl.vi v8, v8, 24
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    csrr a1, vlenb
+; RV32ZVBC64-NEXT:    slli a1, a1, 3
+; RV32ZVBC64-NEXT:    add a1, sp, a1
+; RV32ZVBC64-NEXT:    addi a1, a1, 16
+; RV32ZVBC64-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a3
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a3
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV32ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV32ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV32ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV32ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV32ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV32ZVBC64-NEXT:    vsrl.vi v8, v8, 1
+; RV32ZVBC64-NEXT:    csrr a0, vlenb
+; RV32ZVBC64-NEXT:    slli a0, a0, 4
+; RV32ZVBC64-NEXT:    add sp, sp, a0
+; RV32ZVBC64-NEXT:    addi sp, sp, 16
 ; RV32ZVBC64-NEXT:    ret
 ;
-; RV64ZVBC64-LABEL: clmulh_nxv8i32_vx:
+; RV64ZVBC64-LABEL: clmulh_nxv16i32_vx:
 ; RV64ZVBC64:       # %bb.0:
-; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV64ZVBC64-NEXT:    vmv.v.x v24, a0
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64ZVBC64-NEXT:    vzext.vf2 v16, v8
-; RV64ZVBC64-NEXT:    vzext.vf2 v8, v24
-; RV64ZVBC64-NEXT:    vclmul.vv v16, v16, v8
-; RV64ZVBC64-NEXT:    li a0, 32
-; RV64ZVBC64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC64-NEXT:    vnsrl.wx v8, v16, a0
+; RV64ZVBC64-NEXT:    addi sp, sp, -144
+; RV64ZVBC64-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s0, 128(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s1, 120(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s2, 112(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s3, 104(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s4, 96(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s5, 88(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s6, 80(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s7, 72(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s8, 64(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s9, 56(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s10, 48(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    sd s11, 40(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a1, vlenb
+; RV64ZVBC64-NEXT:    slli a1, a1, 5
+; RV64ZVBC64-NEXT:    mv a2, a1
+; RV64ZVBC64-NEXT:    slli a1, a1, 3
+; RV64ZVBC64-NEXT:    add a1, a1, a2
+; RV64ZVBC64-NEXT:    sub sp, sp, a1
+; RV64ZVBC64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV64ZVBC64-NEXT:    vmv.v.x v0, a0
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 8
+; RV64ZVBC64-NEXT:    lui a0, 16
+; RV64ZVBC64-NEXT:    vsrl.vi v24, v8, 24
+; RV64ZVBC64-NEXT:    addi a0, a0, -256
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a0
+; RV64ZVBC64-NEXT:    vor.vv v16, v16, v24
+; RV64ZVBC64-NEXT:    csrr a1, vlenb
+; RV64ZVBC64-NEXT:    slli a1, a1, 3
+; RV64ZVBC64-NEXT:    mv a2, a1
+; RV64ZVBC64-NEXT:    slli a1, a1, 1
+; RV64ZVBC64-NEXT:    add a2, a2, a1
+; RV64ZVBC64-NEXT:    slli a1, a1, 4
+; RV64ZVBC64-NEXT:    add a1, a1, a2
+; RV64ZVBC64-NEXT:    add a1, sp, a1
+; RV64ZVBC64-NEXT:    addi a1, a1, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vsrl.vi v24, v0, 8
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a0
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v0, 24
+; RV64ZVBC64-NEXT:    vor.vv v24, v24, v16
+; RV64ZVBC64-NEXT:    vsll.vi v16, v8, 24
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a0
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsll.vi v16, v0, 24
+; RV64ZVBC64-NEXT:    vand.vx v0, v0, a0
+; RV64ZVBC64-NEXT:    vsll.vi v0, v0, 8
+; RV64ZVBC64-NEXT:    vor.vv v0, v16, v0
+; RV64ZVBC64-NEXT:    lui a1, 61681
+; RV64ZVBC64-NEXT:    lui a2, 209715
+; RV64ZVBC64-NEXT:    lui t6, 349525
+; RV64ZVBC64-NEXT:    li s5, 16
+; RV64ZVBC64-NEXT:    li t2, 32
+; RV64ZVBC64-NEXT:    li a7, 256
+; RV64ZVBC64-NEXT:    li t0, 512
+; RV64ZVBC64-NEXT:    li t1, 1024
+; RV64ZVBC64-NEXT:    li s6, 1
+; RV64ZVBC64-NEXT:    lui t3, 1
+; RV64ZVBC64-NEXT:    lui t4, 2
+; RV64ZVBC64-NEXT:    lui t5, 4
+; RV64ZVBC64-NEXT:    lui s0, 8
+; RV64ZVBC64-NEXT:    lui s1, 32
+; RV64ZVBC64-NEXT:    lui s2, 64
+; RV64ZVBC64-NEXT:    lui s3, 128
+; RV64ZVBC64-NEXT:    lui s4, 256
+; RV64ZVBC64-NEXT:    lui s7, 512
+; RV64ZVBC64-NEXT:    lui s8, 1024
+; RV64ZVBC64-NEXT:    lui s9, 2048
+; RV64ZVBC64-NEXT:    lui s10, 4096
+; RV64ZVBC64-NEXT:    lui s11, 8192
+; RV64ZVBC64-NEXT:    lui ra, 16384
+; RV64ZVBC64-NEXT:    lui a5, 32768
+; RV64ZVBC64-NEXT:    addi a3, a1, -241
+; RV64ZVBC64-NEXT:    addi a2, a2, 819
+; RV64ZVBC64-NEXT:    addi a1, t6, 1365
+; RV64ZVBC64-NEXT:    vor.vv v16, v0, v24
+; RV64ZVBC64-NEXT:    vsrl.vi v24, v16, 4
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a3
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a3
+; RV64ZVBC64-NEXT:    vsll.vi v16, v16, 4
+; RV64ZVBC64-NEXT:    vor.vv v16, v24, v16
+; RV64ZVBC64-NEXT:    vsrl.vi v24, v16, 2
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a2
+; RV64ZVBC64-NEXT:    vsll.vi v16, v16, 2
+; RV64ZVBC64-NEXT:    vor.vv v16, v24, v16
+; RV64ZVBC64-NEXT:    vsrl.vi v24, v16, 1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a1
+; RV64ZVBC64-NEXT:    vadd.vv v16, v16, v16
+; RV64ZVBC64-NEXT:    vor.vv v0, v24, v16
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s5
+; RV64ZVBC64-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a0, vlenb
+; RV64ZVBC64-NEXT:    slli a0, a0, 4
+; RV64ZVBC64-NEXT:    mv a4, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 4
+; RV64ZVBC64-NEXT:    add a0, a0, a4
+; RV64ZVBC64-NEXT:    add a0, sp, a0
+; RV64ZVBC64-NEXT:    addi a0, a0, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    lui t6, 65536
+; RV64ZVBC64-NEXT:    csrr a0, vlenb
+; RV64ZVBC64-NEXT:    slli a0, a0, 3
+; RV64ZVBC64-NEXT:    mv a4, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 4
+; RV64ZVBC64-NEXT:    add a0, a0, a4
+; RV64ZVBC64-NEXT:    add a0, sp, a0
+; RV64ZVBC64-NEXT:    addi a0, a0, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vor.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a3
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a3
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t2
+; RV64ZVBC64-NEXT:    csrr a0, vlenb
+; RV64ZVBC64-NEXT:    slli a0, a0, 3
+; RV64ZVBC64-NEXT:    mv a4, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 4
+; RV64ZVBC64-NEXT:    add a0, a0, a4
+; RV64ZVBC64-NEXT:    add a0, sp, a0
+; RV64ZVBC64-NEXT:    addi a0, a0, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    lui a4, 131072
+; RV64ZVBC64-NEXT:    slli s6, s6, 11
+; RV64ZVBC64-NEXT:    lui t2, 262144
+; RV64ZVBC64-NEXT:    lui s5, 524288
+; RV64ZVBC64-NEXT:    li a6, 64
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a6
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a0, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 5
+; RV64ZVBC64-NEXT:    add a6, a6, a0
+; RV64ZVBC64-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    li a6, 128
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a6
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 8
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a7
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t0
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 4
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t1
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s6
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 5
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t3
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t4
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 4
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t5
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s0
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 6
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    lui a6, 16
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a6
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s1
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 4
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s2
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s3
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 5
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s4
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s7
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 4
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s8
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 4
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s9
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s10
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s11
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 4
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, ra
+; RV64ZVBC64-NEXT:    csrr a6, vlenb
+; RV64ZVBC64-NEXT:    slli a6, a6, 3
+; RV64ZVBC64-NEXT:    mv a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 2
+; RV64ZVBC64-NEXT:    add a7, a7, a6
+; RV64ZVBC64-NEXT:    slli a6, a6, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a7
+; RV64ZVBC64-NEXT:    add a6, sp, a6
+; RV64ZVBC64-NEXT:    addi a6, a6, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a5
+; RV64ZVBC64-NEXT:    csrr a5, vlenb
+; RV64ZVBC64-NEXT:    slli a5, a5, 5
+; RV64ZVBC64-NEXT:    mv a6, a5
+; RV64ZVBC64-NEXT:    slli a5, a5, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a6
+; RV64ZVBC64-NEXT:    add a5, sp, a5
+; RV64ZVBC64-NEXT:    addi a5, a5, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t6
+; RV64ZVBC64-NEXT:    csrr a5, vlenb
+; RV64ZVBC64-NEXT:    slli a5, a5, 3
+; RV64ZVBC64-NEXT:    mv a6, a5
+; RV64ZVBC64-NEXT:    slli a5, a5, 1
+; RV64ZVBC64-NEXT:    add a6, a6, a5
+; RV64ZVBC64-NEXT:    slli a5, a5, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a6
+; RV64ZVBC64-NEXT:    add a5, sp, a5
+; RV64ZVBC64-NEXT:    addi a5, a5, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, a4
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 2
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v24, v0, 1
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 4
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vi v16, v0, 8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, t2
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 6
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vand.vx v16, v0, s5
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v24
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v24, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v0, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 8
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    addi a4, sp, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 6
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 6
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 7
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 7
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 8
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 6
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v16, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vmul.vv v8, v8, v16
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v24
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v0
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v16, v8
+; RV64ZVBC64-NEXT:    addi a4, sp, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 6
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 2
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 7
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v24, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v0, v24, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v0, v0, v8
+; RV64ZVBC64-NEXT:    vsll.vi v16, v16, 24
+; RV64ZVBC64-NEXT:    vand.vx v8, v24, a0
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 8
+; RV64ZVBC64-NEXT:    vor.vv v16, v16, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v8, v0, v8
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 5
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 8
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 3
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    csrr a4, vlenb
+; RV64ZVBC64-NEXT:    slli a4, a4, 4
+; RV64ZVBC64-NEXT:    mv a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a5, a5, a4
+; RV64ZVBC64-NEXT:    slli a4, a4, 1
+; RV64ZVBC64-NEXT:    add a4, a4, a5
+; RV64ZVBC64-NEXT:    add a4, sp, a4
+; RV64ZVBC64-NEXT:    addi a4, a4, 32
+; RV64ZVBC64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64ZVBC64-NEXT:    vxor.vv v8, v8, v0
+; RV64ZVBC64-NEXT:    vsrl.vi v24, v24, 8
+; RV64ZVBC64-NEXT:    vand.vx v24, v24, a0
+; RV64ZVBC64-NEXT:    vsrl.vi v8, v8, 24
+; RV64ZVBC64-NEXT:    vor.vv v8, v24, v8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 4
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a3
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a3
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 4
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 2
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a2
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a2
+; RV64ZVBC64-NEXT:    vsll.vi v8, v8, 2
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v16, v8, 1
+; RV64ZVBC64-NEXT:    vand.vx v8, v8, a1
+; RV64ZVBC64-NEXT:    vand.vx v16, v16, a1
+; RV64ZVBC64-NEXT:    vadd.vv v8, v8, v8
+; RV64ZVBC64-NEXT:    vor.vv v8, v16, v8
+; RV64ZVBC64-NEXT:    vsrl.vi v8, v8, 1
+; RV64ZVBC64-NEXT:    csrr a0, vlenb
+; RV64ZVBC64-NEXT:    slli a0, a0, 5
+; RV64ZVBC64-NEXT:    mv a1, a0
+; RV64ZVBC64-NEXT:    slli a0, a0, 3
+; RV64ZVBC64-NEXT:    add a0, a0, a1
+; RV64ZVBC64-NEXT:    add sp, sp, a0
+; RV64ZVBC64-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s0, 128(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s1, 120(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s2, 112(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s3, 104(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s4, 96(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s5, 88(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s6, 80(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s7, 72(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s8, 64(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s9, 56(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s10, 48(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    ld s11, 40(sp) # 8-byte Folded Reload
+; RV64ZVBC64-NEXT:    addi sp, sp, 144
 ; RV64ZVBC64-NEXT:    ret
 ;
-; RV32ZVBC32-LABEL: clmulh_nxv8i32_vx:
+; RV32ZVBC32-LABEL: clmulh_nxv16i32_vx:
 ; RV32ZVBC32:       # %bb.0:
-; RV32ZVBC32-NEXT:    addi sp, sp, -16
-; RV32ZVBC32-NEXT:    csrr a1, vlenb
-; RV32ZVBC32-NEXT:    slli a1, a1, 4
-; RV32ZVBC32-NEXT:    sub sp, sp, a1
-; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32ZVBC32-NEXT:    vmv.v.x v24, a0
-; RV32ZVBC32-NEXT:    lui a0, 524288
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32ZVBC32-NEXT:    vzext.vf2 v16, v8
-; RV32ZVBC32-NEXT:    li a2, 16
-; RV32ZVBC32-NEXT:    li a1, 32
-; RV32ZVBC32-NEXT:    vzext.vf2 v8, v24
-; RV32ZVBC32-NEXT:    vand.vi v24, v8, 2
-; RV32ZVBC32-NEXT:    vand.vi v0, v8, 1
-; RV32ZVBC32-NEXT:    vmul.vv v24, v16, v24
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v0, v24
-; RV32ZVBC32-NEXT:    vand.vi v0, v8, 4
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vi v0, v8, 8
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 64
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a1
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    li a2, 1
-; RV32ZVBC32-NEXT:    slli a2, a2, 11
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 1
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 2
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 4
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 8
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 16
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 32
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 64
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 128
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 256
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 512
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 1024
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 2048
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 4096
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 8192
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 16384
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 32768
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 65536
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    lui a2, 131072
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV32ZVBC32-NEXT:    csrr a3, vlenb
-; RV32ZVBC32-NEXT:    slli a3, a3, 3
-; RV32ZVBC32-NEXT:    add a3, sp, a3
-; RV32ZVBC32-NEXT:    addi a3, a3, 16
-; RV32ZVBC32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV32ZVBC32-NEXT:    addi a2, sp, 8
-; RV32ZVBC32-NEXT:    sw a0, 8(sp)
-; RV32ZVBC32-NEXT:    sw zero, 12(sp)
-; RV32ZVBC32-NEXT:    vlse64.v v24, (a2), zero
-; RV32ZVBC32-NEXT:    addi a0, sp, 16
-; RV32ZVBC32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV32ZVBC32-NEXT:    csrr a0, vlenb
-; RV32ZVBC32-NEXT:    slli a0, a0, 3
-; RV32ZVBC32-NEXT:    add a0, sp, a0
-; RV32ZVBC32-NEXT:    addi a0, a0, 16
-; RV32ZVBC32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32ZVBC32-NEXT:    vxor.vv v0, v24, v0
-; RV32ZVBC32-NEXT:    lui a0, 262144
-; RV32ZVBC32-NEXT:    addi a2, sp, 16
-; RV32ZVBC32-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; RV32ZVBC32-NEXT:    vand.vv v24, v8, v24
-; RV32ZVBC32-NEXT:    vand.vx v8, v8, a0
-; RV32ZVBC32-NEXT:    vmul.vv v8, v16, v8
-; RV32ZVBC32-NEXT:    vxor.vv v8, v0, v8
-; RV32ZVBC32-NEXT:    vmul.vv v16, v16, v24
-; RV32ZVBC32-NEXT:    vxor.vv v16, v8, v16
-; RV32ZVBC32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC32-NEXT:    vnsrl.wx v8, v16, a1
-; RV32ZVBC32-NEXT:    csrr a0, vlenb
-; RV32ZVBC32-NEXT:    slli a0, a0, 4
-; RV32ZVBC32-NEXT:    add sp, sp, a0
-; RV32ZVBC32-NEXT:    addi sp, sp, 16
+; RV32ZVBC32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV32ZVBC32-NEXT:    ret
 ;
-; RV64ZVBC32-LABEL: clmulh_nxv8i32_vx:
+; RV64ZVBC32-LABEL: clmulh_nxv16i32_vx:
 ; RV64ZVBC32:       # %bb.0:
-; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV64ZVBC32-NEXT:    vmv.v.x v24, a0
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64ZVBC32-NEXT:    vzext.vf2 v16, v8
-; RV64ZVBC32-NEXT:    li a1, 16
-; RV64ZVBC32-NEXT:    li a0, 32
-; RV64ZVBC32-NEXT:    li a2, 64
-; RV64ZVBC32-NEXT:    vzext.vf2 v8, v24
-; RV64ZVBC32-NEXT:    vand.vi v24, v8, 2
-; RV64ZVBC32-NEXT:    vand.vi v0, v8, 1
-; RV64ZVBC32-NEXT:    vmul.vv v24, v16, v24
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v0, v24
-; RV64ZVBC32-NEXT:    vand.vi v0, v8, 4
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vi v0, v8, 8
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a1
-; RV64ZVBC32-NEXT:    li a1, 128
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a0
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    li a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a1
-; RV64ZVBC32-NEXT:    li a1, 512
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    li a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a1
-; RV64ZVBC32-NEXT:    li a1, 1
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    slli a2, a1, 11
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 1
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 2
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 4
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 8
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 16
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 32
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 64
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 128
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 256
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 512
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 1024
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 2048
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 4096
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 8192
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 16384
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 32768
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 65536
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 131072
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    lui a2, 262144
-; RV64ZVBC32-NEXT:    slli a1, a1, 31
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vand.vx v0, v8, a2
-; RV64ZVBC32-NEXT:    vand.vx v8, v8, a1
-; RV64ZVBC32-NEXT:    vmul.vv v0, v16, v0
-; RV64ZVBC32-NEXT:    vxor.vv v24, v24, v0
-; RV64ZVBC32-NEXT:    vmul.vv v8, v16, v8
-; RV64ZVBC32-NEXT:    vxor.vv v16, v24, v8
-; RV64ZVBC32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC32-NEXT:    vnsrl.wx v8, v16, a0
+; RV64ZVBC32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV64ZVBC32-NEXT:    vclmulh.vx v8, v8, a0
 ; RV64ZVBC32-NEXT:    ret
-  %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i64 0
-  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-  %va.ext = zext <vscale x 8 x i32> %va to <vscale x 8 x i64>
-  %vb.ext = zext <vscale x 8 x i32> %vb to <vscale x 8 x i64>
-  %clmul = call <vscale x 8 x i64> @llvm.clmul.nxv8i64(<vscale x 8 x i64> %va.ext, <vscale x 8 x i64> %vb.ext)
-  %res.ext = lshr <vscale x 8 x i64> %clmul, splat(i64 32)
-  %res = trunc <vscale x 8 x i64> %res.ext to <vscale x 8 x i32>
-  ret <vscale x 8 x i32> %res
-}
-
-define <vscale x 16 x i32> @clmulh_nxv16i32_vv(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb) nounwind {
-; RV32-LABEL: clmulh_nxv16i32_vv:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -80
-; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 68(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 64(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 52(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 48(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s8, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s9, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s10, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s11, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    sub sp, sp, a0
-; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    lui a0, 16
-; RV32-NEXT:    vsrl.vi v0, v8, 24
-; RV32-NEXT:    addi a0, a0, -256
-; RV32-NEXT:    vand.vx v24, v24, a0
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    mv a2, a1
-; RV32-NEXT:    slli a1, a1, 1
-; RV32-NEXT:    add a2, a2, a1
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsrl.vi v0, v16, 8
-; RV32-NEXT:    vand.vx v0, v0, a0
-; RV32-NEXT:    vsrl.vi v24, v16, 24
-; RV32-NEXT:    vor.vv v0, v0, v24
-; RV32-NEXT:    vsll.vi v24, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vsll.vi v24, v16, 24
-; RV32-NEXT:    vand.vx v16, v16, a0
-; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    lui t6, 349525
-; RV32-NEXT:    li s5, 16
-; RV32-NEXT:    li t2, 32
-; RV32-NEXT:    li a7, 256
-; RV32-NEXT:    li t0, 512
-; RV32-NEXT:    li t1, 1024
-; RV32-NEXT:    li s6, 1
-; RV32-NEXT:    lui t3, 1
-; RV32-NEXT:    lui t4, 2
-; RV32-NEXT:    lui t5, 4
-; RV32-NEXT:    lui s0, 8
-; RV32-NEXT:    lui s1, 32
-; RV32-NEXT:    lui s2, 64
-; RV32-NEXT:    lui s3, 128
-; RV32-NEXT:    lui s4, 256
-; RV32-NEXT:    lui s7, 512
-; RV32-NEXT:    lui s8, 1024
-; RV32-NEXT:    lui s9, 2048
-; RV32-NEXT:    lui s10, 4096
-; RV32-NEXT:    lui s11, 8192
-; RV32-NEXT:    lui ra, 16384
-; RV32-NEXT:    lui a5, 32768
-; RV32-NEXT:    addi a3, a1, -241
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    addi a1, t6, 1365
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vsrl.vi v24, v16, 4
-; RV32-NEXT:    vand.vx v16, v16, a3
-; RV32-NEXT:    vand.vx v24, v24, a3
-; RV32-NEXT:    vsll.vi v16, v16, 4
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v16, 2
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vand.vx v24, v24, a2
-; RV32-NEXT:    vsll.vi v16, v16, 2
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v16, 1
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    vand.vx v24, v24, a1
-; RV32-NEXT:    vadd.vv v16, v16, v16
-; RV32-NEXT:    vor.vv v0, v24, v16
-; RV32-NEXT:    vand.vx v16, v0, s5
-; RV32-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    mv a4, a0
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    lui t6, 65536
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    mv a4, a0
-; RV32-NEXT:    slli a0, a0, 1
-; RV32-NEXT:    add a4, a4, a0
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vand.vx v8, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a3
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vand.vx v8, v8, a2
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vsll.vi v8, v8, 2
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vx v8, v8, a1
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    vadd.vv v8, v8, v8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vand.vx v16, v0, t2
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    mv a4, a0
-; RV32-NEXT:    slli a0, a0, 1
-; RV32-NEXT:    add a4, a4, a0
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    lui a4, 131072
-; RV32-NEXT:    slli s6, s6, 11
-; RV32-NEXT:    lui t2, 262144
-; RV32-NEXT:    lui s5, 524288
-; RV32-NEXT:    li a6, 64
-; RV32-NEXT:    vand.vx v16, v0, a6
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a0, a6
-; RV32-NEXT:    slli a6, a6, 5
-; RV32-NEXT:    add a6, a6, a0
-; RV32-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    li a6, 128
-; RV32-NEXT:    vand.vx v16, v0, a6
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 8
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, a7
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, t0
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 4
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, t1
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s6
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 5
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, t3
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, t4
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 4
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, t5
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s0
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 6
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    lui a6, 16
-; RV32-NEXT:    vand.vx v16, v0, a6
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s1
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 4
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s2
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s3
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 5
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s4
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s7
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 4
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s8
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 4
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s9
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s10
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s11
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 4
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, ra
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, a5
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 5
-; RV32-NEXT:    mv a6, a5
-; RV32-NEXT:    slli a5, a5, 1
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, t6
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 3
-; RV32-NEXT:    mv a6, a5
-; RV32-NEXT:    slli a5, a5, 1
-; RV32-NEXT:    add a6, a6, a5
-; RV32-NEXT:    slli a5, a5, 2
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vi v16, v0, 2
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vi v24, v0, 1
-; RV32-NEXT:    vand.vi v16, v0, 4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vi v16, v0, 8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, t2
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 6
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s5
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vmul.vv v16, v8, v24
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v24, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v0, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 8
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 6
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 6
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 7
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 7
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 8
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 6
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v8, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v16, v16, v8
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vxor.vv v16, v16, v0
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v16, v16, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v16, v16, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v16, v16, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v16, v16, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v16, v8
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 6
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 7
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v0, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v0, v0, v8
-; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    vand.vx v8, v24, a0
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v16, v16, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v8, v0, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v8, v8, v0
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v8, v8, v0
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 8
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v8, v8, v0
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v8, v8, v0
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v24, v24, 8
-; RV32-NEXT:    vand.vx v24, v24, a0
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vand.vx v8, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a3
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vand.vx v8, v8, a2
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vsll.vi v8, v8, 2
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vx v8, v8, a1
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    vadd.vv v8, v8, v8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v8, v8, 1
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 64(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 52(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 48(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s8, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s9, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s10, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s11, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 80
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: clmulh_nxv16i32_vv:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -144
-; RV64-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 128(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s1, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s3, 104(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s4, 96(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s5, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s6, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s7, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s8, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s9, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s10, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s11, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 5
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    sub sp, sp, a0
-; RV64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 8
-; RV64-NEXT:    lui a0, 16
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    addi a0, a0, -256
-; RV64-NEXT:    vand.vx v24, v24, a0
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    mv a2, a1
-; RV64-NEXT:    slli a1, a1, 1
-; RV64-NEXT:    add a2, a2, a1
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 32
-; RV64-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vsrl.vi v0, v16, 8
-; RV64-NEXT:    vand.vx v0, v0, a0
-; RV64-NEXT:    vsrl.vi v24, v16, 24
-; RV64-NEXT:    vor.vv v0, v0, v24
-; RV64-NEXT:    vsll.vi v24, v8, 24
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vsll.vi v8, v8, 8
-; RV64-NEXT:    vor.vv v8, v24, v8
-; RV64-NEXT:    vsll.vi v24, v16, 24
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsll.vi v16, v16, 8
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    lui a1, 61681
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui t6, 349525
-; RV64-NEXT:    li s5, 16
-; RV64-NEXT:    li t2, 32
-; RV64-NEXT:    li a7, 256
-; RV64-NEXT:    li t0, 512
-; RV64-NEXT:    li t1, 1024
-; RV64-NEXT:    li s6, 1
-; RV64-NEXT:    lui t3, 1
-; RV64-NEXT:    lui t4, 2
-; RV64-NEXT:    lui t5, 4
-; RV64-NEXT:    lui s0, 8
-; RV64-NEXT:    lui s1, 32
-; RV64-NEXT:    lui s2, 64
-; RV64-NEXT:    lui s3, 128
-; RV64-NEXT:    lui s4, 256
-; RV64-NEXT:    lui s7, 512
-; RV64-NEXT:    lui s8, 1024
-; RV64-NEXT:    lui s9, 2048
-; RV64-NEXT:    lui s10, 4096
-; RV64-NEXT:    lui s11, 8192
-; RV64-NEXT:    lui ra, 16384
-; RV64-NEXT:    lui a5, 32768
-; RV64-NEXT:    addi a3, a1, -241
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a1, t6, 1365
-; RV64-NEXT:    vor.vv v16, v16, v0
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vand.vx v24, v24, a3
-; RV64-NEXT:    vsll.vi v16, v16, 4
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vi v24, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsll.vi v16, v16, 2
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vand.vx v24, v24, a1
-; RV64-NEXT:    vadd.vv v16, v16, v16
-; RV64-NEXT:    vor.vv v0, v24, v16
-; RV64-NEXT:    vand.vx v16, v0, s5
-; RV64-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    mv a4, a0
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 32
-; RV64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui t6, 65536
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    mv a4, a0
-; RV64-NEXT:    slli a0, a0, 1
-; RV64-NEXT:    add a4, a4, a0
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 32
-; RV64-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vsll.vi v8, v8, 4
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vsll.vi v8, v8, 2
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vadd.vv v8, v8, v8
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vand.vx v16, v0, t2
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    mv a4, a0
-; RV64-NEXT:    slli a0, a0, 1
-; RV64-NEXT:    add a4, a4, a0
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 32
-; RV64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui a4, 131072
-; RV64-NEXT:    slli s6, s6, 11
-; RV64-NEXT:    lui t2, 262144
-; RV64-NEXT:    lui s5, 524288
-; RV64-NEXT:    li a6, 64
-; RV64-NEXT:    vand.vx v16, v0, a6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a0, a6
-; RV64-NEXT:    slli a6, a6, 5
-; RV64-NEXT:    add a6, a6, a0
-; RV64-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    li a6, 128
-; RV64-NEXT:    vand.vx v16, v0, a6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 8
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, a7
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t0
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t1
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 5
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t3
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t4
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t5
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s0
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 6
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui a6, 16
-; RV64-NEXT:    vand.vx v16, v0, a6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s1
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s2
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s3
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 5
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s4
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s7
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s8
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s9
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s10
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s11
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, ra
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, a5
-; RV64-NEXT:    csrr a5, vlenb
-; RV64-NEXT:    slli a5, a5, 5
-; RV64-NEXT:    mv a6, a5
-; RV64-NEXT:    slli a5, a5, 1
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a5, sp, a5
-; RV64-NEXT:    addi a5, a5, 32
-; RV64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t6
-; RV64-NEXT:    csrr a5, vlenb
-; RV64-NEXT:    slli a5, a5, 3
-; RV64-NEXT:    mv a6, a5
-; RV64-NEXT:    slli a5, a5, 1
-; RV64-NEXT:    add a6, a6, a5
-; RV64-NEXT:    slli a5, a5, 2
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a5, sp, a5
-; RV64-NEXT:    addi a5, a5, 32
-; RV64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, a4
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vi v16, v0, 2
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vi v24, v0, 1
-; RV64-NEXT:    vand.vi v16, v0, 4
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vi v16, v0, 8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t2
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s5
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vmul.vv v16, v8, v24
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v24, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v0, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 8
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    addi a4, sp, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 7
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 7
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 8
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v8, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    vxor.vv v16, v16, v24
-; RV64-NEXT:    vxor.vv v16, v16, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v16, v8
-; RV64-NEXT:    addi a4, sp, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 7
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v0, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v0, v0, v8
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    vand.vx v8, v24, a0
-; RV64-NEXT:    vsll.vi v8, v8, 8
-; RV64-NEXT:    vor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v0, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 8
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    vsrl.vi v24, v24, 8
-; RV64-NEXT:    vand.vx v24, v24, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 24
-; RV64-NEXT:    vor.vv v8, v24, v8
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vsll.vi v8, v8, 4
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vsll.vi v8, v8, 2
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vadd.vv v8, v8, v8
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v8, v8, 1
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 5
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 128(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s1, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s3, 104(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s4, 96(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s5, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s6, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s7, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s8, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s9, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s10, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s11, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 144
-; RV64-NEXT:    ret
-  %va.ext = zext <vscale x 16 x i32> %va to <vscale x 16 x i64>
-  %vb.ext = zext <vscale x 16 x i32> %vb to <vscale x 16 x i64>
-  %clmul = call <vscale x 16 x i64> @llvm.clmul.nxv16i64(<vscale x 16 x i64> %va.ext, <vscale x 16 x i64> %vb.ext)
-  %res.ext = lshr <vscale x 16 x i64> %clmul, splat(i64 32)
-  %res = trunc <vscale x 16 x i64> %res.ext to <vscale x 16 x i32>
-  ret <vscale x 16 x i32> %res
-}
-
-define <vscale x 16 x i32> @clmulh_nxv16i32_vx(<vscale x 16 x i32> %va, i32 %b) nounwind {
-; RV32-LABEL: clmulh_nxv16i32_vx:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    lui a4, 16
-; RV32-NEXT:    vsrl.vi v24, v8, 24
-; RV32-NEXT:    vsll.vi v0, v8, 24
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    lui a5, 209715
-; RV32-NEXT:    lui a7, 349525
-; RV32-NEXT:    srli a3, a0, 8
-; RV32-NEXT:    srli a6, a0, 24
-; RV32-NEXT:    addi a1, a4, -256
-; RV32-NEXT:    and a3, a3, a1
-; RV32-NEXT:    or t0, a3, a6
-; RV32-NEXT:    slli a3, a0, 24
-; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    slli a0, a0, 8
-; RV32-NEXT:    or t1, a3, a0
-; RV32-NEXT:    li a6, 1
-; RV32-NEXT:    addi a3, a2, -241
-; RV32-NEXT:    addi a2, a5, 819
-; RV32-NEXT:    addi a0, a7, 1365
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    vand.vx v8, v8, a1
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v0, v8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    or a5, t1, t0
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vand.vx v8, v8, a3
-; RV32-NEXT:    srli a7, a5, 4
-; RV32-NEXT:    and a5, a5, a3
-; RV32-NEXT:    vand.vx v16, v16, a3
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    and a7, a7, a3
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    or a5, a7, a5
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vand.vx v8, v8, a2
-; RV32-NEXT:    srli a7, a5, 2
-; RV32-NEXT:    and a5, a5, a2
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vsll.vi v8, v8, 2
-; RV32-NEXT:    and a7, a7, a2
-; RV32-NEXT:    slli a5, a5, 2
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    or a5, a7, a5
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    srli a7, a5, 1
-; RV32-NEXT:    and a5, a5, a0
-; RV32-NEXT:    vand.vx v16, v16, a0
-; RV32-NEXT:    vadd.vv v8, v8, v8
-; RV32-NEXT:    and a7, a7, a0
-; RV32-NEXT:    slli a5, a5, 1
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    or a5, a7, a5
-; RV32-NEXT:    andi a7, a5, 2
-; RV32-NEXT:    vmul.vx v16, v8, a7
-; RV32-NEXT:    andi a7, a5, 1
-; RV32-NEXT:    vmul.vx v24, v8, a7
-; RV32-NEXT:    andi a7, a5, 4
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    andi a7, a5, 8
-; RV32-NEXT:    vxor.vv v16, v24, v16
-; RV32-NEXT:    vmul.vx v24, v8, a7
-; RV32-NEXT:    andi a7, a5, 16
-; RV32-NEXT:    vxor.vv v16, v16, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    andi a7, a5, 32
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, a7
-; RV32-NEXT:    andi a7, a5, 64
-; RV32-NEXT:    vxor.vv v16, v16, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    andi a7, a5, 128
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, a7
-; RV32-NEXT:    andi a7, a5, 256
-; RV32-NEXT:    vxor.vv v16, v16, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    andi a7, a5, 512
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    csrr t0, vlenb
-; RV32-NEXT:    slli t0, t0, 3
-; RV32-NEXT:    add t0, sp, t0
-; RV32-NEXT:    addi t0, t0, 16
-; RV32-NEXT:    vs8r.v v16, (t0) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vmul.vx v24, v8, a7
-; RV32-NEXT:    andi a7, a5, 1024
-; RV32-NEXT:    vxor.vv v0, v16, v0
-; RV32-NEXT:    vxor.vv v24, v0, v24
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    lui a7, 1
-; RV32-NEXT:    slli a6, a6, 11
-; RV32-NEXT:    and a6, a5, a6
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a6
-; RV32-NEXT:    lui a6, 2
-; RV32-NEXT:    and a7, a5, a7
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    lui a7, 4
-; RV32-NEXT:    and a6, a5, a6
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a6
-; RV32-NEXT:    lui a6, 8
-; RV32-NEXT:    and a7, a5, a7
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    lui a7, 32
-; RV32-NEXT:    and a6, a5, a6
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a6
-; RV32-NEXT:    lui a6, 64
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a4
-; RV32-NEXT:    lui a4, 128
-; RV32-NEXT:    and a7, a5, a7
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    lui a7, 256
-; RV32-NEXT:    and a6, a5, a6
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a6
-; RV32-NEXT:    lui a6, 512
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a4
-; RV32-NEXT:    lui a4, 1024
-; RV32-NEXT:    and a7, a5, a7
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    lui a7, 2048
-; RV32-NEXT:    and a6, a5, a6
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a6
-; RV32-NEXT:    lui a6, 4096
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a4
-; RV32-NEXT:    lui a4, 8192
-; RV32-NEXT:    and a7, a5, a7
-; RV32-NEXT:    and a6, a5, a6
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a6
-; RV32-NEXT:    vxor.vv v0, v24, v0
-; RV32-NEXT:    vmul.vx v16, v8, a4
-; RV32-NEXT:    vxor.vv v16, v0, v16
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    vand.vx v0, v24, a1
-; RV32-NEXT:    vsll.vi v0, v0, 8
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    lui a4, 16384
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vmul.vx v0, v8, a4
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v0, v16, v0
-; RV32-NEXT:    lui a4, 32768
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vmul.vx v16, v8, a4
-; RV32-NEXT:    vxor.vv v16, v0, v16
-; RV32-NEXT:    lui a4, 65536
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vmul.vx v0, v8, a4
-; RV32-NEXT:    vxor.vv v16, v16, v0
-; RV32-NEXT:    lui a4, 131072
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vmul.vx v0, v8, a4
-; RV32-NEXT:    vxor.vv v16, v16, v0
-; RV32-NEXT:    lui a4, 262144
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vmul.vx v0, v8, a4
-; RV32-NEXT:    vxor.vv v16, v16, v0
-; RV32-NEXT:    lui a4, 524288
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vmul.vx v8, v8, a4
-; RV32-NEXT:    vxor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v24, 8
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vand.vx v8, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a3
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vand.vx v8, v8, a2
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vsll.vi v8, v8, 2
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    vand.vx v16, v16, a0
-; RV32-NEXT:    vadd.vv v8, v8, v8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v8, v8, 1
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: clmulh_nxv16i32_vx:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -144
-; RV64-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 128(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s1, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s3, 104(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s4, 96(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s5, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s6, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s7, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s8, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s9, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s10, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s11, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 5
-; RV64-NEXT:    mv a2, a1
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV64-NEXT:    vmv.v.x v0, a0
-; RV64-NEXT:    vsrl.vi v16, v8, 8
-; RV64-NEXT:    lui a0, 16
-; RV64-NEXT:    vsrl.vi v24, v8, 24
-; RV64-NEXT:    addi a0, a0, -256
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    mv a2, a1
-; RV64-NEXT:    slli a1, a1, 1
-; RV64-NEXT:    add a2, a2, a1
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 32
-; RV64-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vsrl.vi v24, v0, 8
-; RV64-NEXT:    vand.vx v24, v24, a0
-; RV64-NEXT:    vsrl.vi v16, v0, 24
-; RV64-NEXT:    vor.vv v24, v24, v16
-; RV64-NEXT:    vsll.vi v16, v8, 24
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vsll.vi v8, v8, 8
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsll.vi v16, v0, 24
-; RV64-NEXT:    vand.vx v0, v0, a0
-; RV64-NEXT:    vsll.vi v0, v0, 8
-; RV64-NEXT:    vor.vv v0, v16, v0
-; RV64-NEXT:    lui a1, 61681
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui t6, 349525
-; RV64-NEXT:    li s5, 16
-; RV64-NEXT:    li t2, 32
-; RV64-NEXT:    li a7, 256
-; RV64-NEXT:    li t0, 512
-; RV64-NEXT:    li t1, 1024
-; RV64-NEXT:    li s6, 1
-; RV64-NEXT:    lui t3, 1
-; RV64-NEXT:    lui t4, 2
-; RV64-NEXT:    lui t5, 4
-; RV64-NEXT:    lui s0, 8
-; RV64-NEXT:    lui s1, 32
-; RV64-NEXT:    lui s2, 64
-; RV64-NEXT:    lui s3, 128
-; RV64-NEXT:    lui s4, 256
-; RV64-NEXT:    lui s7, 512
-; RV64-NEXT:    lui s8, 1024
-; RV64-NEXT:    lui s9, 2048
-; RV64-NEXT:    lui s10, 4096
-; RV64-NEXT:    lui s11, 8192
-; RV64-NEXT:    lui ra, 16384
-; RV64-NEXT:    lui a5, 32768
-; RV64-NEXT:    addi a3, a1, -241
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a1, t6, 1365
-; RV64-NEXT:    vor.vv v16, v0, v24
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vand.vx v24, v24, a3
-; RV64-NEXT:    vsll.vi v16, v16, 4
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vi v24, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsll.vi v16, v16, 2
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vand.vx v24, v24, a1
-; RV64-NEXT:    vadd.vv v16, v16, v16
-; RV64-NEXT:    vor.vv v0, v24, v16
-; RV64-NEXT:    vand.vx v16, v0, s5
-; RV64-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    mv a4, a0
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 32
-; RV64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui t6, 65536
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    mv a4, a0
-; RV64-NEXT:    slli a0, a0, 1
-; RV64-NEXT:    add a4, a4, a0
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 32
-; RV64-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vsll.vi v8, v8, 4
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vsll.vi v8, v8, 2
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vadd.vv v8, v8, v8
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vand.vx v16, v0, t2
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    mv a4, a0
-; RV64-NEXT:    slli a0, a0, 1
-; RV64-NEXT:    add a4, a4, a0
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 32
-; RV64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui a4, 131072
-; RV64-NEXT:    slli s6, s6, 11
-; RV64-NEXT:    lui t2, 262144
-; RV64-NEXT:    lui s5, 524288
-; RV64-NEXT:    li a6, 64
-; RV64-NEXT:    vand.vx v16, v0, a6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a0, a6
-; RV64-NEXT:    slli a6, a6, 5
-; RV64-NEXT:    add a6, a6, a0
-; RV64-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    li a6, 128
-; RV64-NEXT:    vand.vx v16, v0, a6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 8
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, a7
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t0
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t1
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 5
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t3
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t4
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t5
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s0
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 6
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui a6, 16
-; RV64-NEXT:    vand.vx v16, v0, a6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s1
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s2
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s3
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 5
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s4
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s7
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s8
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s9
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s10
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s11
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, ra
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, a5
-; RV64-NEXT:    csrr a5, vlenb
-; RV64-NEXT:    slli a5, a5, 5
-; RV64-NEXT:    mv a6, a5
-; RV64-NEXT:    slli a5, a5, 1
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a5, sp, a5
-; RV64-NEXT:    addi a5, a5, 32
-; RV64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t6
-; RV64-NEXT:    csrr a5, vlenb
-; RV64-NEXT:    slli a5, a5, 3
-; RV64-NEXT:    mv a6, a5
-; RV64-NEXT:    slli a5, a5, 1
-; RV64-NEXT:    add a6, a6, a5
-; RV64-NEXT:    slli a5, a5, 2
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a5, sp, a5
-; RV64-NEXT:    addi a5, a5, 32
-; RV64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, a4
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vi v16, v0, 2
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vi v24, v0, 1
-; RV64-NEXT:    vand.vi v16, v0, 4
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vi v16, v0, 8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t2
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s5
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vmul.vv v16, v8, v24
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v24, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v0, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 8
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    addi a4, sp, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 7
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 7
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 8
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v8, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    vxor.vv v16, v16, v24
-; RV64-NEXT:    vxor.vv v16, v16, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v16, v8
-; RV64-NEXT:    addi a4, sp, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 7
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v0, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v0, v0, v8
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    vand.vx v8, v24, a0
-; RV64-NEXT:    vsll.vi v8, v8, 8
-; RV64-NEXT:    vor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v0, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 8
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    vsrl.vi v24, v24, 8
-; RV64-NEXT:    vand.vx v24, v24, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 24
-; RV64-NEXT:    vor.vv v8, v24, v8
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vsll.vi v8, v8, 4
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vsll.vi v8, v8, 2
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vadd.vv v8, v8, v8
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v8, v8, 1
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 5
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 128(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s1, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s3, 104(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s4, 96(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s5, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s6, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s7, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s8, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s9, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s10, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s11, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 144
-; RV64-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i64 0
   %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
   %va.ext = zext <vscale x 16 x i32> %va to <vscale x 16 x i64>
@@ -86107,5 +92820,8 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vx_mask(<vscale x 1 x i64> %va, i64 %b
   ret <vscale x 1 x i64> %sel
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
+; RV32: {{.*}}
 ; RV32ZVBC: {{.*}}
+; RV64: {{.*}}
 ; RV64ZVBC: {{.*}}

>From 805e7d02f6950fe496b2ef06089b7ea9a28b6776 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Tue, 3 Mar 2026 21:20:43 -0800
Subject: [PATCH 3/3] fixup! remove unnecessary check

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 768dac1eaa373..446edc21dd3d1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1123,14 +1123,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       } else {
         if (Subtarget.hasStdExtZvbc32e()) {
           setOperationAction({ISD::CLMUL, ISD::CLMULH}, VT, Legal);
-        } else if (Subtarget.hasStdExtZvbc() && Subtarget.hasVInstructionsI64()) {
+        } else if (Subtarget.hasStdExtZvbc()) {
           // Promote to i64 if the lmul is small enough.
           // FIXME: Split if necessary to widen.
           // FIXME: Promote clmulh directly without legalizing to clmul first.
           MVT I64VecVT = MVT::getVectorVT(MVT::i64, VT.getVectorElementCount());
           if (isTypeLegal(I64VecVT))
             setOperationAction(ISD::CLMUL, VT, Custom);
-
         }
       }
 



More information about the llvm-commits mailing list